Initial commit

parents

要显示的修改太多。

为保证性能只显示 1000 of 1000+ 个文件。

# JetBrains PyCharm IDE
.idea/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# macOS dir files
.DS_Store
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# data
/data
# Checkpoints
checkpoints
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# Generated files
/fairseq/temporal_convolution_tbc
/fairseq/modules/*_layer/*_forward.cu
/fairseq/modules/*_layer/*_backward.cu
/fairseq/version.py
# data
data-bin/
# reranking
/examples/reranking/rerank_data
# Cython-generated C++ source files
/fairseq/data/data_utils_fast.cpp
/fairseq/data/token_block_utils_fast.cpp
# VSCODE
.vscode/ftp-sync.json
.vscode/settings.json
# Experimental Folder
experimental/*
# Weights and Biases logs
wandb/
/examples/translation/iwslt14.tokenized.de-en/
/toy/
[submodule "fairseq/model_parallel/megatron"]
path = fairseq/model_parallel/megatron
url = https://github.com/ngoyal2707/Megatron-LM
branch = fairseq
{
// Use Intelligence to find out which attributes exist for node debugging.
// Use hover for the description of the existing attributes.
// For further information visit https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Remote Connection",
"type": "python",
"request": "attach",
"listen": {
"host": "0.0.0.0",
"port": 5678
},
"preLaunchTask": "Launch",
"pathMappings": [
{
"localRoot": "${workspaceFolder}",
"remoteRoot": "."
}
]
}
]
}
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
{
"label": "Launch",
"type": "shell",
// hostname -i 可以自动获得当前 master 的IP,端口号与 launch.json listen 的保持一致
// 如果 debug 的文件想带上参数,加在 ${file} 后面即可,例如 ${file} --batch-size --model
// 如果想更改资源,在 launch 后加上资源配置参数即可,如 launch --gpu 2
// 如果是在 Python 虚拟环境里调试,要指定 python3 为虚拟环境里的 python3 路径
// 原理见文档 https://bytedance.feishu.cn/docx/doxcnZQgAWroFtQrvb1eB2jB3Lf
// MuST-C ST train
"command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/must_c/en-de/st_tok --config-yaml config_share.yaml --task speech_to_text --max-tokens 10000 --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --train-config /root/st/Fairseq-S2T/egs/mustc/st/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/mustc/st/conf/at12_big.yaml --train-config2 /root/st/Fairseq-S2T/egs/mustc/st/conf/conformer.yaml --train-config3 /root/st/Fairseq-S2T/egs/mustc/st/conf/ttr.yaml --fp16 --train-subset train_ttr_all_kd --valid-subset dev_ttr_all_kd --ctc-weight 0.2 --xctc-weight 0.2 --axctc-weight 0 --inter-ctc-weight 0.1 --inter-ctc-layers 6,9 --inter-xctc-weight 0.1 --inter-xctc-layers 9 --inter-axctc-weight 0.2 --inter-axctc-layers 6 --ctc-pae inter_league --adapter inter_league --ctc-pae-ground-truth-ratio 0.3 --xctc-pae-ground-truth-ratio 0.3 --xctc-pae-ground-truth-ratio-decay 50:100:0 --only-train-enc-prob 0.3 --ctc-masked-loss",
"command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/must_c/en-de/st --config-yaml config_share.yaml --task speech_to_text --max-tokens 10000 --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --train-config /root/st/Fairseq-S2T/egs/mustc/st/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/mustc/st/conf/xctc.yaml --train-config2 /root/st/Fairseq-S2T/egs/mustc/st/conf/ctc.yaml --train-config3 /root/st/Fairseq-S2T/egs/mustc/st/conf/purexctc.yaml --fp16 --inter-xctc-weight 0.1 --inter-xctc-layers 6,9 --ctc-pae inter_league --xctc-pae-ground-truth-ratio 0.3",
// MuST-C ST train
// "command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/must_c/en-de/st --config-yaml config_share.yaml --task speech_to_text --max-tokens 10000 --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --train-config /root/st/Fairseq-S2T/egs/mustc/st/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/mustc/st/conf/base.yaml --train-config2 /root/st/Fairseq-S2T/egs/mustc/st/conf/mixup.yaml --train-config3 /root/st/Fairseq-S2T/egs/mustc/st/conf/ctc.yaml --fp16",
// MuST-C ST Decoding
// "command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 /opt/tiger/s2t/fairseq_cli/generate.py /mnt/bd/data-model/data/must_c/en-de/st --config-yaml config_share.yaml --gen-subset dev --task speech_to_text --path /opt/tiger/s2t/checkpoints/must_c/en-de/st/3C/tca/sate_big_conformer_inter_ctc_text12_tca_parallel/checkpoint_best.pt --results-path /opt/tiger/s2t/checkpoints/must_c/en-de/st/3C/tca/sate_big_conformer_inter_ctc_text12_tca_parallel --max-tokens 80000 --beam 5 --lenpen 1.0 --scoring sacrebleu",
// MuST-C En-Ja ST Decoding
// "command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 /opt/tiger/s2t/fairseq_cli/generate.py /mnt/bd/data-model/data/must_c/en-ja/st_tok --config-yaml config.yaml --gen-subset dev --task speech_to_text --path /opt/tiger/s2t/checkpoints/must_c/en-ja/st/1122_tok_ae12_big_conformer_ctc_ttr_ctc_kd_ctc0.2_0_0.2_inter0.1_0.1_0.1_pae_adapter/checkpoint_best.pt --max-tokens 80000 --beam 5 --lenpen 1.0 --scoring sacrebleu",
// "command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/must_c/en-de/mt/st/data-bin --source-lang en --target-lang de --max-tokens 4000 --task translation --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/must_c/en-de/mt/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --train-config /root/st/Fairseq-S2T/egs/mustc/mt/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/mustc/mt/conf/small.yaml --train-config2 /root/st/Fairseq-S2T/egs/mustc/mt/conf/inter.yaml --train-config3 /root/st/Fairseq-S2T/egs/mustc/mt/conf/inter.yaml --fp16",
// Librispeech ASR train
// "command": "launch --gpu 1 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/librispeech/asr --config-yaml config.yaml --task speech_to_text --max-tokens 10000 --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/debug --train-config /root/st/Fairseq-S2T/egs/librispeech/asr/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/librispeech/asr/conf/EffecientConformerCTCSmall.yaml --fp16",
// "command": "launch --gpu 8 --cpu 40 --memory 80 -- python3 -m debugpy --connect $(hostname -i | awk '{print $2}'):5678 train.py /mnt/bd/data-model/data/must_c/en-de/st_nofil --config-yaml config_share.yaml --task speech_to_text --max-tokens 40000 --skip-invalid-size-inputs-valid-test --log-interval 1 --save-dir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --tensorboard-logdir /opt/tiger/s2t/checkpoints/must_c/en-de/st/debug --train-config /root/st/Fairseq-S2T/egs/mustc/st/conf/basis.yaml --train-config1 /root/st/Fairseq-S2T/egs/mustc/st/conf/base.yaml --train-config2 /root/st/Fairseq-S2T/egs/mustc/st/conf/ctc.yaml --train-config3 /root/st/Fairseq-S2T/egs/mustc/st/conf/ctc.yaml --fp16",
"isBackground": true,
"problemMatcher": {
"pattern": [
{
"regexp": ".",
"file": 1,
"location": 2,
"message": 3
}
],
"background": {
"activeOnStart": true,
"beginsPattern": ".",
"endsPattern": "."
}
},
"presentation": {
"reveal": "silent",
"clear": true
}
}
]
}
\ No newline at end of file
# Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <conduct@pytorch.org>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
# Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq)
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
## License
By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq),
you agree that your contributions will be licensed under the LICENSE file in
the root directory of this source tree.
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# NiuTrans-Fairseq-S2T
This project adapts the [fairseq](https://github.com/pytorch/fairseq) toolkit for speech-to-text tasks, including speech recognition and speech translation.
It contains the implementation of the following methods proposed by NiuTrans Team.
[Stacked Acoustic-and-Textual Encoding: Integrating the Pre-trained Models into Speech Translation Encoders](https://arxiv.org/abs/2105.05752)
## Key Features
### Training
- Support the Kaldi-style complete recipes
- ASR, MT, and ST pipelines (bin)
- Read training config in yaml file
- CTC multi-task learning
- MT training in the ST-like way (Online tokenizer) (This may be slowly.)
- speed perturb during pre-processing
### Model
- Conformer Architecture
- Load pre-trained modules
- Relative position representation
- Stacked acoustic-and-textual encoding
- Progressive down-sampling for acoustic encoding
## Installation
* Note we only test the following environment.
1. Python == 3.6
2. torch == 1.8, torchaudio == 0.8.0, cuda == 10.2
3. apex
```
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```
4. nccl
```
make -j src.build CUDA_HOME=<path to cuda install>
```
5. gcc ≥ 4.9 (We use the version 5.4)
6. python library
```
pip install pandas sentencepiece configargparse gpustat tensorboard editdistance
```
## Code Structure
We supply the recipes for multiple benchmarks in the egs folder, including machine translation, speech recognition, and speech translation corpora.
Besides, we also provide the template for other benchmarks.
Here is an example for MuST-C:
```markdown
mustc
├── asr
│   ├── binary.sh
│   ├── conf/
│   ├── decode.sh
│   ├── local/
│   ├── run.sh
│   └── train.sh
├── mt
│   ├── binary.sh
│   ├── conf/
│   ├── decode.sh
│   ├── local/
│   ├── run.sh
│   └── train.sh
└── st
├── binary.sh
├── conf/
├── decode.sh
├── local/
├── run.sh
└── train.sh
```
* run.sh: the core script that includes the whole pipeline
* train.sh: call the run.sh for training
* decode.sh: call the run.sh for decoding
* binary.sh: generate the datasets alone
* conf: the folder to save the configure files (.yaml).
* local: the folder to save utils
* monitor.sh: check the GPUS for running the program automatically
* parse_options.sh: parse the parameters for run.sh
* utils.sh: the util shell functions
## Citations
```bibtex
@inproceedings{xu-etal-2021-stacked,
title = "Stacked Acoustic-and-Textual Encoding: Integrating the Pre-trained Models into Speech Translation Encoders",
author = "Xu, Chen and
Hu, Bojie and
Li, Yanyang and
Zhang, Yuhao and
Huang, Shen and
Ju, Qi and
Xiao, Tong and
Zhu, Jingbo",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-long.204",
doi = "10.18653/v1/2021.acl-long.204",
pages = "2619--2630",
}
```
\ No newline at end of file
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = fairseq
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
.wy-table-responsive table td kbd {
white-space: nowrap;
}
.wy-table-responsive table td {
white-space: normal !important;
}
.wy-table-responsive {
overflow: visible !important;
}
.. _Command-line Tools:
Command-line Tools
==================
Fairseq provides several command-line tools for training and evaluating models:
- :ref:`fairseq-preprocess`: Data pre-processing: build vocabularies and binarize training data
- :ref:`fairseq-train`: Train a new model on one or multiple GPUs
- :ref:`fairseq-generate`: Translate pre-processed data with a trained model
- :ref:`fairseq-interactive`: Translate raw text with a trained model
- :ref:`fairseq-score`: BLEU scoring of generated translations against reference translations
- :ref:`fairseq-eval-lm`: Language model evaluation
.. _fairseq-preprocess:
fairseq-preprocess
~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.preprocess
.. argparse::
:module: fairseq.options
:func: get_preprocessing_parser
:prog: fairseq-preprocess
.. _fairseq-train:
fairseq-train
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.train
.. argparse::
:module: fairseq.options
:func: get_training_parser
:prog: fairseq-train
.. _fairseq-generate:
fairseq-generate
~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.generate
.. argparse::
:module: fairseq.options
:func: get_generation_parser
:prog: fairseq-generate
.. _fairseq-interactive:
fairseq-interactive
~~~~~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.interactive
.. argparse::
:module: fairseq.options
:func: get_interactive_generation_parser
:prog: fairseq-interactive
.. _fairseq-score:
fairseq-score
~~~~~~~~~~~~~
.. automodule:: fairseq_cli.score
.. argparse::
:module: fairseq_cli.score
:func: get_parser
:prog: fairseq-score
.. _fairseq-eval-lm:
fairseq-eval-lm
~~~~~~~~~~~~~~~
.. automodule:: fairseq_cli.eval_lm
.. argparse::
:module: fairseq.options
:func: get_eval_lm_parser
:prog: fairseq-eval-lm
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# fairseq documentation build configuration file, created by
# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
import sys
from fairseq import __version__
# source code directory, relative to this file, for sphinx-autobuild
sys.path.insert(0, os.path.abspath(".."))
source_suffix = [".rst"]
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinxarg.ext",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "fairseq"
copyright = "Facebook AI Research (FAIR)"
author = "Facebook AI Research (FAIR)"
github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = __version__
# The full version, including alpha/beta/rc tags.
release = __version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
highlight_language = "python"
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_context = {
"css_files": [
"_static/theme_overrides.css", # override wide tables in RTD theme
],
}
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
# html_sidebars = {
# '**': [
# 'about.html',
# 'navigation.html',
# 'relations.html', # needs 'show_related': True theme option to display
# 'searchbox.html',
# 'donate.html',
# ]
# }
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
"python": ("https://docs.python.org/", None),
"torch": ("https://pytorch.org/docs/master/", None),
}
.. role:: hidden
:class: hidden-section
.. _Criterions:
Criterions
==========
Criterions compute the loss function given the model and batch, roughly::
loss = criterion(model, batch)
.. automodule:: fairseq.criterions
:members:
.. autoclass:: fairseq.criterions.FairseqCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.adaptive_loss.AdaptiveLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.composite_loss.CompositeLoss
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.cross_entropy.CrossEntropyCriterion
:members:
:undoc-members:
.. autoclass:: fairseq.criterions.label_smoothed_cross_entropy.LabelSmoothedCrossEntropyCriterion
:members:
:undoc-members:
.. role:: hidden
:class: hidden-section
.. module:: fairseq.data
Data Loading and Utilities
==========================
.. _datasets:
Datasets
--------
**Datasets** define the data format and provide helpers for creating
mini-batches.
.. autoclass:: fairseq.data.FairseqDataset
:members:
.. autoclass:: fairseq.data.LanguagePairDataset
:members:
.. autoclass:: fairseq.data.MonolingualDataset
:members:
**Helper Datasets**
These datasets wrap other :class:`fairseq.data.FairseqDataset` instances and
provide additional functionality:
.. autoclass:: fairseq.data.BacktranslationDataset
:members:
.. autoclass:: fairseq.data.ConcatDataset
:members:
.. autoclass:: fairseq.data.ResamplingDataset
:members:
.. autoclass:: fairseq.data.RoundRobinZipDatasets
:members:
.. autoclass:: fairseq.data.TransformEosDataset
:members:
Dictionary
----------
.. autoclass:: fairseq.data.Dictionary
:members:
Iterators
---------
.. autoclass:: fairseq.data.CountingIterator
:members:
.. autoclass:: fairseq.data.EpochBatchIterator
:members:
.. autoclass:: fairseq.data.GroupedIterator
:members:
.. autoclass:: fairseq.data.ShardedIterator
:members:
[writers]
option-limit=0
Evaluating Pre-trained Models
=============================
First, download a pre-trained model along with its vocabularies:
.. code-block:: console
> curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf -
This model uses a `Byte Pair Encoding (BPE)
vocabulary <https://arxiv.org/abs/1508.07909>`__, so we'll have to apply
the encoding to the source text before it can be translated. This can be
done with the
`apply\_bpe.py <https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/apply_bpe.py>`__
script using the ``wmt14.en-fr.fconv-cuda/bpecodes`` file. ``@@`` is
used as a continuation marker and the original text can be easily
recovered with e.g. ``sed s/@@ //g`` or by passing the ``--remove-bpe``
flag to :ref:`fairseq-generate`. Prior to BPE, input text needs to be tokenized
using ``tokenizer.perl`` from
`mosesdecoder <https://github.com/moses-smt/mosesdecoder>`__.
Let's use :ref:`fairseq-interactive` to generate translations interactively.
Here, we use a beam size of 5 and preprocess the input with the Moses
tokenizer and the given Byte-Pair Encoding vocabulary. It will automatically
remove the BPE continuation markers and detokenize the output.
.. code-block:: console
> MODEL_DIR=wmt14.en-fr.fconv-py
> fairseq-interactive \
--path $MODEL_DIR/model.pt $MODEL_DIR \
--beam 5 --source-lang en --target-lang fr \
--tokenizer moses \
--bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes
| loading model(s) from wmt14.en-fr.fconv-py/model.pt
| [en] dictionary: 44206 types
| [fr] dictionary: 44463 types
| Type the input sentence and press return:
Why is it rare to discover new marine mammal species?
S-0 Why is it rare to discover new marine mam@@ mal species ?
H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins?
P-0 -0.0763 -0.1849 -0.0956 -0.0946 -0.0735 -0.1150 -0.1301 -0.0042 -0.0321 -0.0171 -0.0052 -0.0062 -0.0015
This generation script produces three types of outputs: a line prefixed
with *O* is a copy of the original source sentence; *H* is the
hypothesis along with an average log-likelihood; and *P* is the
positional score per token position, including the
end-of-sentence marker which is omitted from the text.
Other types of output lines you might see are *D*, the detokenized hypothesis,
*T*, the reference target, *A*, alignment info, *E* the history of generation steps.
See the `README <https://github.com/pytorch/fairseq#pre-trained-models>`__ for a
full list of pre-trained models available.
Training a New Model
====================
The following tutorial is for machine translation. For an example of how
to use Fairseq for other tasks, such as :ref:`language modeling`, please see the
``examples/`` directory.
Data Pre-processing
-------------------
Fairseq contains example pre-processing scripts for several translation
datasets: IWSLT 2014 (German-English), WMT 2014 (English-French) and WMT
2014 (English-German). To pre-process and binarize the IWSLT dataset:
.. code-block:: console
> cd examples/translation/
> bash prepare-iwslt14.sh
> cd ../..
> TEXT=examples/translation/iwslt14.tokenized.de-en
> fairseq-preprocess --source-lang de --target-lang en \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/iwslt14.tokenized.de-en
This will write binarized data that can be used for model training to
``data-bin/iwslt14.tokenized.de-en``.
Training
--------
Use :ref:`fairseq-train` to train a new model. Here a few example settings that work
well for the IWSLT 2014 dataset:
.. code-block:: console
> mkdir -p checkpoints/fconv
> CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \
--optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--arch fconv_iwslt_de_en --save-dir checkpoints/fconv
By default, :ref:`fairseq-train` will use all available GPUs on your machine. Use the
``CUDA_VISIBLE_DEVICES`` environment variable to select specific GPUs and/or to
change the number of GPU devices that will be used.
Also note that the batch size is specified in terms of the maximum
number of tokens per batch (``--max-tokens``). You may need to use a
smaller value depending on the available GPU memory on your system.
Generation
----------
Once your model is trained, you can generate translations using
:ref:`fairseq-generate` **(for binarized data)** or
:ref:`fairseq-interactive` **(for raw text)**:
.. code-block:: console
> fairseq-generate data-bin/iwslt14.tokenized.de-en \
--path checkpoints/fconv/checkpoint_best.pt \
--batch-size 128 --beam 5
| [de] dictionary: 35475 types
| [en] dictionary: 24739 types
| data-bin/iwslt14.tokenized.de-en test 6750 examples
| model fconv
| loaded checkpoint trainings/fconv/checkpoint_best.pt
S-721 danke .
T-721 thank you .
...
To generate translations with only a CPU, use the ``--cpu`` flag. BPE
continuation markers can be removed with the ``--remove-bpe`` flag.
Advanced Training Options
=========================
Large mini-batch training with delayed updates
----------------------------------------------
The ``--update-freq`` option can be used to accumulate gradients from
multiple mini-batches and delay updating, creating a larger effective
batch size. Delayed updates can also improve training speed by reducing
inter-GPU communication costs and by saving idle time caused by variance
in workload across GPUs. See `Ott et al.
(2018) <https://arxiv.org/abs/1806.00187>`__ for more details.
To train on a single GPU with an effective batch size that is equivalent
to training on 8 GPUs:
.. code-block:: console
> CUDA_VISIBLE_DEVICES=0 fairseq-train --update-freq 8 (...)
Training with half precision floating point (FP16)
--------------------------------------------------
.. note::
FP16 training requires a Volta GPU and CUDA 9.1 or greater
Recent GPUs enable efficient half precision floating point computation,
e.g., using `Nvidia Tensor Cores
<https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__.
Fairseq supports FP16 training with the ``--fp16`` flag:
.. code-block:: console
> fairseq-train --fp16 (...)
Distributed training
--------------------
Distributed training in fairseq is implemented on top of ``torch.distributed``.
The easiest way to launch jobs is with the `torch.distributed.launch
<https://pytorch.org/docs/stable/distributed.html#launch-utility>`__ tool.
For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
sure to update ``--master_addr`` to the IP address of the first node:
.. code-block:: console
> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
--master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--max-epoch 70 \
--fp16
On SLURM clusters, fairseq will automatically detect the number of nodes and
GPUs, but a port number must be provided:
.. code-block:: console
> salloc --gpus=16 --nodes 2 (...)
> srun fairseq-train --distributed-port 12345 (...).
Sharding very large datasets
----------------------------
It can be challenging to train over very large datasets, particularly if your
machine does not have much system RAM. Most tasks in fairseq support training
over "sharded" datasets, in which the original dataset has been preprocessed
into non-overlapping chunks (or "shards").
For example, instead of preprocessing all your data into a single "data-bin"
directory, you can split the data and create "data-bin1", "data-bin2", etc.
Then you can adapt your training command like so:
.. code-block:: console
> fairseq-train data-bin1:data-bin2:data-bin3 (...)
Training will now iterate over each shard, one by one, with each shard
corresponding to an "epoch", thus reducing system memory usage.
.. fairseq documentation master file, created by
sphinx-quickstart on Fri Aug 17 21:45:30 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://github.com/pytorch/fairseq
fairseq documentation
=====================
Fairseq is a sequence modeling toolkit written in `PyTorch
<http://pytorch.org/>`_ that allows researchers and developers to
train custom models for translation, summarization, language modeling and other
text generation tasks.
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting_started
command_line_tools
.. toctree::
:maxdepth: 1
:caption: Extending Fairseq
overview
tutorial_simple_lstm
tutorial_classifying_names
.. toctree::
:maxdepth: 2
:caption: Library Reference
tasks
models
criterions
optim
lr_scheduler
data
modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`
.. role:: hidden
:class: hidden-section
.. _Learning Rate Schedulers:
Learning Rate Schedulers
========================
Learning Rate Schedulers update the learning rate over the course of training.
Learning rates can be updated after each update via :func:`step_update` or at
epoch boundaries via :func:`step`.
.. automodule:: fairseq.optim.lr_scheduler
:members:
.. autoclass:: fairseq.optim.lr_scheduler.FairseqLRScheduler
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.cosine_lr_scheduler.CosineSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.fixed_schedule.FixedSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.inverse_square_root_schedule.InverseSquareRootSchedule
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.reduce_lr_on_plateau.ReduceLROnPlateau
:members:
:undoc-members:
.. autoclass:: fairseq.optim.lr_scheduler.triangular_lr_scheduler.TriangularSchedule
:members:
:undoc-members:
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=fairseq
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The Sphinx module was not found. Make sure you have Sphinx installed,
echo.then set the SPHINXBUILD environment variable to point to the full
echo.path of the 'sphinx-build' executable. Alternatively you may add the
echo.Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
.. role:: hidden
:class: hidden-section
.. module:: fairseq.models
.. _Models:
Models
======
A Model defines the neural network's ``forward()`` method and encapsulates all
of the learnable parameters in the network. Each model also provides a set of
named *architectures* that define the precise network configuration (e.g.,
embedding dimension, number of layers, etc.).
Both the model type and architecture are selected via the ``--arch``
command-line argument. Once selected, a model may expose additional command-line
arguments for further configuration.
.. note::
All fairseq Models extend :class:`BaseFairseqModel`, which in turn extends
:class:`torch.nn.Module`. Thus any fairseq Model can be used as a
stand-alone Module in other PyTorch code.
Convolutional Neural Networks (CNN)
-----------------------------------
.. module:: fairseq.models.fconv
.. autoclass:: fairseq.models.fconv.FConvModel
:members:
.. autoclass:: fairseq.models.fconv.FConvEncoder
:members:
:undoc-members:
.. autoclass:: fairseq.models.fconv.FConvDecoder
:members:
Long Short-Term Memory (LSTM) networks
--------------------------------------
.. module:: fairseq.models.lstm
.. autoclass:: fairseq.models.lstm.LSTMModel
:members:
.. autoclass:: fairseq.models.lstm.LSTMEncoder
:members:
.. autoclass:: fairseq.models.lstm.LSTMDecoder
:members:
Transformer (self-attention) networks
-------------------------------------
.. module:: fairseq.models.transformer
.. autoclass:: fairseq.models.transformer.TransformerModel
:members:
.. autoclass:: fairseq.models.transformer.TransformerEncoder
:members:
.. autoclass:: fairseq.models.transformer.TransformerEncoderLayer
:members:
.. autoclass:: fairseq.models.transformer.TransformerDecoder
:members:
.. autoclass:: fairseq.models.transformer.TransformerDecoderLayer
:members:
Adding new models
-----------------
.. currentmodule:: fairseq.models
.. autofunction:: fairseq.models.register_model
.. autofunction:: fairseq.models.register_model_architecture
.. autoclass:: fairseq.models.BaseFairseqModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoderDecoderModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoderModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqLanguageModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqMultiModel
:members:
:undoc-members:
.. autoclass:: fairseq.models.FairseqEncoder
:members:
.. autoclass:: fairseq.models.CompositeEncoder
:members:
.. autoclass:: fairseq.models.FairseqDecoder
:members:
.. _Incremental decoding:
Incremental decoding
--------------------
.. autoclass:: fairseq.models.FairseqIncrementalDecoder
:members:
:undoc-members:
Modules
=======
Fairseq provides several stand-alone :class:`torch.nn.Module` classes that may
be helpful when implementing a new :class:`~fairseq.models.BaseFairseqModel`.
.. automodule:: fairseq.modules
:members:
:undoc-members:
.. role:: hidden
:class: hidden-section
.. _optimizers:
Optimizers
==========
Optimizers update the Model parameters based on the gradients.
.. automodule:: fairseq.optim
:members:
.. autoclass:: fairseq.optim.FairseqOptimizer
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adadelta.Adadelta
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adagrad.Adagrad
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adafactor.FairseqAdafactor
:members:
:undoc-members:
.. autoclass:: fairseq.optim.adam.FairseqAdam
:members:
:undoc-members:
.. autoclass:: fairseq.optim.fp16_optimizer.FP16Optimizer
:members:
:undoc-members:
.. autoclass:: fairseq.optim.nag.FairseqNAG
:members:
:undoc-members:
.. autoclass:: fairseq.optim.sgd.SGD
:members:
:undoc-members:
Overview
========
Fairseq can be extended through user-supplied `plug-ins
<https://en.wikipedia.org/wiki/Plug-in_(computing)>`_. We support five kinds of
plug-ins:
- :ref:`Models` define the neural network architecture and encapsulate all of the
learnable parameters.
- :ref:`Criterions` compute the loss function given the model outputs and targets.
- :ref:`Tasks` store dictionaries and provide helpers for loading/iterating over
Datasets, initializing the Model/Criterion and calculating the loss.
- :ref:`Optimizers` update the Model parameters based on the gradients.
- :ref:`Learning Rate Schedulers` update the learning rate over the course of
training.
**Training Flow**
Given a ``model``, ``criterion``, ``task``, ``optimizer`` and ``lr_scheduler``,
fairseq implements the following high-level training flow::
for epoch in range(num_epochs):
itr = task.get_batch_iterator(task.dataset('train'))
for num_updates, batch in enumerate(itr):
task.train_step(batch, model, criterion, optimizer)
average_and_clip_gradients()
optimizer.step()
lr_scheduler.step_update(num_updates)
lr_scheduler.step(epoch)
where the default implementation for ``task.train_step`` is roughly::
def train_step(self, batch, model, criterion, optimizer, **unused):
loss = criterion(model, batch)
optimizer.backward(loss)
return loss
**Registering new plug-ins**
New plug-ins are *registered* through a set of ``@register`` function
decorators, for example::
@register_model('my_lstm')
class MyLSTM(FairseqEncoderDecoderModel):
(...)
Once registered, new plug-ins can be used with the existing :ref:`Command-line
Tools`. See the Tutorial sections for more detailed walkthroughs of how to add
new plug-ins.
**Loading plug-ins from another directory**
New plug-ins can be defined in a custom module stored in the user system. In
order to import the module, and make the plugin available to *fairseq*, the
command line supports the ``--user-dir`` flag that can be used to specify a
custom location for additional modules to load into *fairseq*.
For example, assuming this directory tree::
/home/user/my-module/
└── __init__.py
with ``__init__.py``::
from fairseq.models import register_model_architecture
from fairseq.models.transformer import transformer_vaswani_wmt_en_de_big
@register_model_architecture('transformer', 'my_transformer')
def transformer_mmt_big(args):
transformer_vaswani_wmt_en_de_big(args)
it is possible to invoke the :ref:`fairseq-train` script with the new architecture with::
fairseq-train ... --user-dir /home/user/my-module -a my_transformer --task translation
.. role:: hidden
:class: hidden-section
.. module:: fairseq.tasks
.. _Tasks:
Tasks
=====
Tasks store dictionaries and provide helpers for loading/iterating over
Datasets, initializing the Model/Criterion and calculating the loss.
Tasks can be selected via the ``--task`` command-line argument. Once selected, a
task may expose additional command-line arguments for further configuration.
Example usage::
# setup the task (e.g., load dictionaries)
task = fairseq.tasks.setup_task(args)
# build model and criterion
model = task.build_model(args)
criterion = task.build_criterion(args)
# load datasets
task.load_dataset('train')
task.load_dataset('valid')
# iterate over mini-batches of data
batch_itr = task.get_batch_iterator(
task.dataset('train'), max_tokens=4096,
)
for batch in batch_itr:
# compute the loss
loss, sample_size, logging_output = task.get_loss(
model, criterion, batch,
)
loss.backward()
Translation
-----------
.. autoclass:: fairseq.tasks.translation.TranslationTask
.. _language modeling:
Language Modeling
-----------------
.. autoclass:: fairseq.tasks.language_modeling.LanguageModelingTask
Adding new tasks
----------------
.. autofunction:: fairseq.tasks.register_task
.. autoclass:: fairseq.tasks.FairseqTask
:members:
:undoc-members:
set -e
eval=1
lcrm=0
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=~/st/data/test
vocab_dir=~/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
subsets=(2019)
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
post-process: sentencepiece
# best-checkpoint-metric: loss
# maximize-best-checkpoint-metric: False
eval-wer: True
eval-wer-args: {"beam": 1, "lenpen": 1.0}
eval-wer-tok-args: {"wer_remove_punct": true, "wer_lowercase": true, "wer_char_level": true}
eval-wer-remove-bpe: sentencepiece
eval-wer-print-samples: True
best_checkpoint_metric: dec_wer
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
# keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0014
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv2d
subsampling-layers: 2
subsampling-filter: 512
subsampling-kernel: 3
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: relu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
cnn-module-norm: layer_norm
load-pretrained-encoder-from: /home/xuchen/after.pt
load-pretrained-decoder-from: /home/xuchen/after.pt
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
ctc-weight: 0.3
# share-ctc-and-embed: True
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
compression-metric: threshold
compression-mode: create
compression-layers: 6,9
compression-threshold: 0.95
compression-norm: True
compression-pos: True
\ No newline at end of file
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
# ctc-pae-ground-truth-ratio: 0.1
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-ctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.5
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
inter-ctc-mlo: 1:2:3
\ No newline at end of file
encoder-embed-norm: True
encoder-no-scale-embedding: True
\ No newline at end of file
arch: pdss2t_transformer_s_8
pds-fusion: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0014
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_ctc
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 4
#load-pretrained-encoder-from:
\ No newline at end of file
encoder-attention-type: rel_selfattn
# encoder-attention-type: relative
# max-encoder-relative-length: 100
\ No newline at end of file
#!/usr/bin/env bash
gpu_num=1
data_dir=
test_subset=(dev test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
cer=1
ctc_infer=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 3
--stop_stage 3
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--cer ${cer}
--ctc_infer ${ctc_infer}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
#!/usr/bin/env bash
set -e
ref=$1
gen=$2
tokenizer=$3
lang=$4
lang_pair=en-${lang}
record=$(mktemp -t temp.record.XXXXXX)
if [[ ${tokenizer} -eq 1 ]]; then
echo "MultiBLEU" > ${record}
cmd="multi-bleu.perl ${ref} < ${gen}"
eval $cmd | head -n 1 >> ${record}
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${ref} > ${ref}.detok"
eval $cmd
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${gen} > ${gen}.detok"
eval $cmd
ref=${ref}.detok
gen=${gen}.detok
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair}"
eval $cmd >> ${record}
cat ${record}
rm ${record}
\ No newline at end of file
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
tokenizer=$6
lang=$7
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
if [[ ! -f ${ctc_infer_sort} ]]; then
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
fi
gen=${ctc_infer_sort}
./cal_bleu.sh ${ref} ${gen} ${tokenizer} ${lang}
\ No newline at end of file
import unicodedata
import jiwer
import jiwer.transforms as tr
import sys
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
wer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.ExpandCommonEnglishContractions(),
tr.RemoveKaldiNonWords(),
tr.RemoveWhiteSpace(replace_by_space=True),
tr.ReduceToListOfListOfWords(),
]
)
cer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ReduceToListOfListOfChars(),
]
)
ref_lines = open(ref_file, "r").readlines()
hyp_lines = open(hyp_file, "r").readlines()
wer = jiwer.wer(ref_lines, hyp_lines,
truth_transform=wer_standardize,
hypothesis_transform=wer_standardize,
)
cer = jiwer.cer(ref_lines, hyp_lines,
truth_transform=cer_standardize,
hypothesis_transform=cer_standardize,
)
print("WER: %.4f" % wer)
print("CER: %.4f" % cer)
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
python3 ./cal_wer.py ${ref} ${ctc_infer_sort}
\ No newline at end of file
import sys
import csv
tsv_file = sys.argv[1]
out_file = sys.argv[2]
extract_item = sys.argv[3]
with open(tsv_file) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
samples = [dict(e) for e in reader]
fw = open(out_file, "w", encoding="utf-8")
for s in samples:
if extract_item in s:
fw.write("%s\n" % s[extract_item])
else:
print("Error in sample: ")
print(s)
exit()
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
i=1
argv="$@"
while true; do
key=${!i}
j=$(($i + 1))
value=${!j}
[ -z "${!i:-}" ] && break; # break if there are no arguments
case "${key}" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '${key}}'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "${key}" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
#eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# echo $name
eval $name=\"${value}\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "${value}" != "true" && "${value}" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": ${key} ${value}" 1>&2
exit 1;
fi
# shift 2;
i=$(($i + 2))
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#!/usr/bin/env bash
# training the model
gpu_num=2
update_freq=1
max_tokens=160000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# CTC
config_list=(purectc)
# Transformer
config_list=(base ctc)
# Conformer
#config_list=(base conformer ctc)
# PDS
config_list=(purectc_pds_base_8)
config_list=(pds_base_8)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /_/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
post-process: sentencepiece
#best_checkpoint_metric: loss
#maximize_best_checkpoint_metric: False
eval-wer: True
eval-wer-args: {"beam": 5, "lenpen": 1.0}
eval-wer-tok-args: {"wer_remove_punct": true, "wer_lowercase": true, "wer_char_level": false}
best_checkpoint_metric: dec_wer
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
ctc-weight: 0.3
share-ctc-and-embed: True
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
# ctc-weight: 0.3
share-ctc-and-embed: True
share-target-ctc-and-embed: True
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 6,9
interleaved-ctc-drop-prob: 0
share-interleaved-ctc: True
sae-adapter: inter_league
#sae-adapter: none
#target-sae-adapter: none
sae-ctc-temperature: 1
#sae-gumbel: True
#sae-distribution-hard: True
#sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
#share-sae-and-ctc: True
#share-target-sae-and-ctc: True
#sae-embed-norm: True
#sae-out-norm: True
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.5
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
arch: pdss2t_transformer_s_8
pds-fusion: True
#ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
#ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
#ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 0.1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 256
pds-stages: 4
#ctc-layer: 12
pds-layers: 4_2_2_4
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 0.1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 384
pds-stages: 4
#ctc-layer: 15
encoder-layers: 12
pds-layers: 4_3_3_2
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 192_256_256_384
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_4
pds-attn-heads: 4_4_4_6
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
decoder-layers: 6
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 0.1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 512
pds-stages: 4
#ctc-layer: 15
encoder-layers: 18
pds-layers: 6_3_3_6
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_384_384_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_4_4_4
pds-attn-heads: 4_6_6_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
decoder-layers: 6
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
#ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_ctc
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
encoder-attention-heads: 4
#load-pretrained-encoder-from:
\ No newline at end of file
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 8_4_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 256
pds-stages: 4
#ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
encoder-attention-heads: 4
#load-pretrained-encoder-from:
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 200
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 200_200_200
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-layers: 12
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 15
#encoder-activation-fn: swish
#encoder-attention-type: rel_pos
#load-pretrained-encoder-from:
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#intermedia-temperature: 5
#encoder-attention-type: rel_pos
#encoder-attention-type: reduced_rel_pos
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 384
pds-stages: 4
#ctc-layer: 15
encoder-layers: 12
pds-layers: 4_3_3_2
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 128_256_256_384
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_4
pds-attn-heads: 4_4_4_8
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.002
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
#load-pretrained-encoder-from:
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#intermedia-temperature: 5
encoder-attention-type: rel_pos
#encoder-attention-type: reduced_rel_pos
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 512
pds-stages: 4
#ctc-layer: 15
encoder-layers: 10
pds-layers: 3_2_2_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_384_384_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_4_4_4
pds-attn-heads: 4_6_6_8
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.002
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
encoder-activation-fn: swish
#load-pretrained-encoder-from:
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 240
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-layers: 12
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 15
#encoder-activation-fn: swish
#encoder-attention-type: rel_pos
#load-pretrained-encoder-from:
encoder-attention-type: rel_pos
#encoder-attention-type: relative
#max-encoder-relative-length: 100
#!/usr/bin/env bash
gpu_num=1
data_dir=
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
cer=0
ctc_infer=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 3
--stop_stage 3
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--cer ${cer}
--ctc_infer ${ctc_infer}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
#!/usr/bin/env bash
set -e
ref=$1
gen=$2
tokenizer=$3
lang=$4
lang_pair=en-${lang}
record=$(mktemp -t temp.record.XXXXXX)
if [[ ${tokenizer} -eq 1 ]]; then
echo "MultiBLEU" > ${record}
cmd="multi-bleu.perl ${ref} < ${gen}"
eval $cmd | head -n 1 >> ${record}
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${ref} > ${ref}.detok"
eval $cmd
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${gen} > ${gen}.detok"
eval $cmd
ref=${ref}.detok
gen=${gen}.detok
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair}"
eval $cmd >> ${record}
cat ${record}
rm ${record}
\ No newline at end of file
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
tokenizer=$6
lang=$7
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
if [[ ! -f ${ctc_infer_sort} ]]; then
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
fi
gen=${ctc_infer_sort}
./cal_bleu.sh ${ref} ${gen} ${tokenizer} ${lang}
\ No newline at end of file
import unicodedata
import jiwer
import jiwer.transforms as tr
import sys
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
wer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.ExpandCommonEnglishContractions(),
tr.RemoveKaldiNonWords(),
tr.RemoveWhiteSpace(replace_by_space=True),
tr.ReduceToListOfListOfWords(),
]
)
cer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ReduceToListOfListOfChars(),
]
)
ref_lines = open(ref_file, "r").readlines()
hyp_lines = open(hyp_file, "r").readlines()
wer = jiwer.wer(ref_lines, hyp_lines,
truth_transform=wer_standardize,
hypothesis_transform=wer_standardize,
)
cer = jiwer.cer(ref_lines, hyp_lines,
truth_transform=cer_standardize,
hypothesis_transform=cer_standardize,
)
print("WER: %.4f" % wer)
print("CER: %.4f" % cer)
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
python3 ./cal_wer.py ${ref} ${ctc_infer_sort}
\ No newline at end of file
import sys
import csv
tsv_file = sys.argv[1]
out_file = sys.argv[2]
extract_item = sys.argv[3]
with open(tsv_file) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
samples = [dict(e) for e in reader]
fw = open(out_file, "w", encoding="utf-8")
for s in samples:
if extract_item in s:
fw.write("%s\n" % s[extract_item])
else:
print("Error in sample: ")
print(s)
exit()
#!/usr/bin/env bash
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
i=1
argv="$@"
while true; do
key=${!i}
j=$(($i + 1))
value=${!j}
[ -z "${!i:-}" ] && break; # break if there are no arguments
case "${key}" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '${key}}'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "${key}" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
#eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# echo $name
eval $name=\"${value}\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "${value}" != "true" && "${value}" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": ${key} ${value}" 1>&2
exit 1;
fi
# shift 2;
i=$(($i + 2))
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#!/usr/bin/env bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# CTC
# config_list=(purectc)
# Transformer
config_list=(base ctc)
# Conformer
#config_list=(base conformer ctc)
# PDS
# config_list=(purectc_pds_base_8)
# config_list=(pds_base_8)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 4
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
#!/usr/bin/env bash
set -e
eval=1
lcrm=0
src_lang=en
tgt_lang=de
tokenize=0
splits=(train dev)
lang=${src_lang}-${tgt_lang}
root_dir=/opt/tiger/s2t
data_dir=/mnt/bd/data-model/data/must_c/${lang}/mt/st_tok/infer
vocab_dir=/root/st/data/must_c/${lang}/mt/st_tok
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
#src_vocab_prefix=spm_unigram5000_asr
#tgt_vocab_prefix=spm_unigram10000_st
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
mkdir -p ${data_dir}/data-bin
cd ${root_dir}
cmd="python3 ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
差异被折叠。 点击展开。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论