xuchen / Fairseq-S2T

Commit 33c67f90 authored Feb 22, 2022 by xuchen
fix the bugs
parent 1f8d6f6c
Showing 6 changed files with 30 additions and 12 deletions
fairseq/criterions/ctc.py                       +3  -3
fairseq/models/speech_to_text/s2t_ctc.py        +2  -3
fairseq/models/speech_to_text/s2t_sate.py       +0  -1
fairseq/models/transformer.py                   +17 -1
fairseq/modules/dlcl.py                         +4  -4
fairseq/modules/speech_to_text/subsampling.py   +4  -0
fairseq/criterions/ctc.py
@@ -176,9 +176,9 @@ class CtcCriterion(FairseqCriterion):
             from torch.distributions import Categorical
             # ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
             # ctc_logit = ctc_logit / ctc_logit.sum(dim=-1, keepdim=True)
-            cut_ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
-            ctc_entropy = Categorical(logits=cut_ctc_logit).entropy().sum()
-            # ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
+            # cut_ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
+            # ctc_entropy = Categorical(logits=cut_ctc_logit).entropy().sum()
+            ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
             logging_output["ctc_entropy"] = utils.item(ctc_entropy.data)
             logging_output["ctc_loss"] = utils.item(ctc_loss.data)
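Note: the fix drops the earlier top-100 truncation and logs the entropy of the full per-frame distribution instead. A minimal standalone sketch of what Categorical(logits=...).entropy() computes over CTC logits; the (time, batch, vocab) shape is an assumption for illustration, not taken from the repo:

    import torch
    from torch.distributions import Categorical

    # Hypothetical CTC logits: (time, batch, vocab). Categorical normalizes
    # the logits with a softmax over the last dimension internally.
    ctc_logit = torch.randn(50, 4, 1000)

    # Per-frame entropy of the output distribution, summed over all frames
    # and sentences -- the quantity stored in logging_output["ctc_entropy"].
    ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
    print(ctc_entropy.item())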
fairseq/models/speech_to_text/s2t_ctc.py
@@ -19,7 +19,6 @@ from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
     PositionalEmbedding,
-    PositionalEncoding,
     LegacyRelPositionalEncoding,
     RelPositionalEncoding,
     S2TTransformerEncoderLayer,
@@ -464,7 +463,7 @@ class S2TCTCEncoder(FairseqEncoder):
             self.embed_positions = RelPositionalEncoding(
                 args.max_source_positions, args.encoder_embed_dim
             )
-        elif self.attn_type == "rel_selfattn":
+        elif self.attn_type in ["rel_selfattn", "rel_pos_legacy"]:
             self.embed_positions = LegacyRelPositionalEncoding(
                 args.encoder_embed_dim, args.dropout, args.max_source_positions
             )
@@ -560,7 +559,7 @@ class S2TCTCEncoder(FairseqEncoder):
         # padding and position embedding
         encoder_padding_mask = lengths_to_padding_mask(input_lengths)
-        if self.attn_type == "rel_pos" or self.attn_type == "rel_selfattn":
+        if self.attn_type in ["rel_selfattn", "rel_pos", "rel_pos_legacy"]:
             positions = self.embed_positions(x)
         elif self.attn_type == "rope":
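Note: the new rel_pos_legacy attention type shares LegacyRelPositionalEncoding with rel_selfattn, while rel_pos keeps the newer RelPositionalEncoding. As background, a generic Transformer-XL-style sketch of what a relative positional encoding table looks like; this is an illustration only, and the repo's modules may differ in signature and caching:

    import torch

    def rel_pos_table(seq_len: int, dim: int) -> torch.Tensor:
        # Sinusoidal embeddings for relative offsets (seq_len-1) .. -(seq_len-1),
        # giving a (2*seq_len - 1, dim) table indexed by relative distance.
        offsets = torch.arange(seq_len - 1, -seq_len, -1.0).unsqueeze(1)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        angles = offsets * inv_freq
        return torch.cat([angles.sin(), angles.cos()], dim=-1)

    print(rel_pos_table(seq_len=5, dim=8).shape)  # torch.Size([9, 8])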
fairseq/models/speech_to_text/s2t_sate.py
@@ -333,7 +333,6 @@ class S2TSATEEncoder(FairseqEncoder):
         if "ctc_logit" in acoustic_encoder_out and len(acoustic_encoder_out["ctc_logit"]) > 0:
             ctc_logit = acoustic_encoder_out["ctc_logit"][0]
             ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1)
-            # ctc_prob = self.acoustic_encoder.ctc.softmax(encoder_out, self.temperature)
         else:
             ctc_logit = None
             ctc_prob = None
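Note: the surviving line converts the acoustic encoder's CTC logits into a temperature-scaled distribution. A small sketch of the temperature's effect; the shape and the value 2.0 are assumptions for illustration:

    import torch
    import torch.nn.functional as F

    ctc_logit = torch.randn(50, 4, 1000)   # hypothetical (time, batch, vocab)
    temperature = 2.0                      # >1 flattens, <1 sharpens

    ctc_prob = F.softmax(ctc_logit / temperature, dim=-1)
    print(ctc_prob.sum(dim=-1)[0, 0])      # each frame still sums to 1.0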
fairseq/models/transformer.py
@@ -25,6 +25,8 @@ from fairseq.modules import (
     LayerNorm,
     PositionalEmbedding,
     SinusoidalPositionalEmbedding,
+    RelPositionalEncoding,
+    LegacyRelPositionalEncoding,
     TransformerDecoderLayer,
     TransformerEncoderLayer,
     DynamicLinearCombination
@@ -173,7 +175,8 @@ class TransformerModel(FairseqEncoderDecoderModel):
                             help='checkpoint activations at each layer, which saves GPU '
                                  'memory usage at the cost of some additional compute')
         parser.add_argument('--offload-activations', action='store_true',
-                            help='checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations.')
+                            help='checkpoint activations at each layer, then save to gpu. '
+                                 'Sets --checkpoint-activations.')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')
@@ -204,6 +207,8 @@ class TransformerModel(FairseqEncoderDecoderModel):
                 "selfattn",
                 "rel_selfattn",
                 "relative",
+                "rel_pos",
+                "rel_pos_legacy",
             ],
             help="transformer encoder self-attention layer type")
@@ -473,6 +478,17 @@ class TransformerEncoder(FairseqEncoder):
         self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

+        if self.attn_type == "rel_pos":
+            self.embed_positions = RelPositionalEncoding(
+                args.max_source_positions, args.encoder_embed_dim
+            )
+        elif self.attn_type in ["rel_selfattn", "rel_pos_legacy"]:
+            self.embed_positions = LegacyRelPositionalEncoding(
+                args.encoder_embed_dim, args.dropout, args.max_source_positions
+            )
+        elif self.attn_type == "rope":
+            self.embed_positions = None
+        else:  # Use absolute positional embedding
             self.embed_positions = (
                 PositionalEmbedding(
                     args.max_source_positions,
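Note: this change registers the two new attention types as legal parser values and reflows the --offload-activations help string (adjacent Python string literals concatenate at compile time, so the help text itself is unchanged). A minimal argparse sketch of the extended choices list; the flag name --encoder-attention-type is an assumption, since the hunk shows only the choices and help text:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder-attention-type",          # assumed flag name
        default="selfattn",
        choices=["selfattn", "rel_selfattn", "relative",
                 "rel_pos", "rel_pos_legacy"],
        help="transformer encoder self-attention layer type",
    )
    args = parser.parse_args(["--encoder-attention-type", "rel_pos_legacy"])
    print(args.encoder_attention_type)       # rel_pos_legacy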
fairseq/modules/dlcl.py
@@ -3,6 +3,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np

+from fairseq.modules.layer_norm import LayerNorm
+

 class DynamicLinearCombination(nn.Module):
     """Implementation of Dynamic Linear Combination of Layers (DLCL)
@@ -34,9 +36,9 @@ class DynamicLinearCombination(nn.Module):
         # init triangular layer norm
         if args.normalize_embed:
-            self.layer_norms = nn.ModuleList([nn.LayerNorm(self.dim) for _ in range(layer_num)])
+            self.layer_norms = nn.ModuleList([LayerNorm(self.dim) for _ in range(layer_num)])
         else:
-            self.layer_norms = nn.ModuleList([nn.Sequential()] + [nn.LayerNorm(self.dim) for _ in range(layer_num - 1)])
+            self.layer_norms = nn.ModuleList([nn.Sequential()] + [LayerNorm(self.dim) for _ in range(layer_num - 1)])

         # states
         self.count = 0
@@ -165,5 +167,3 @@ class DynamicLinearCombination(nn.Module):
-    def forward(self):
-        pass
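Note on the motivation for swapping nn.LayerNorm for the LayerNorm imported from fairseq.modules.layer_norm: fairseq's helper returns apex's fused kernel when apex is installed and falls back to torch.nn.LayerNorm otherwise. A simplified sketch of that factory (the real helper also considers CUDA availability and an export flag):

    import torch.nn as nn

    def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
        # Prefer apex's fused kernel when available; otherwise fall back
        # to the standard PyTorch implementation.
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            return nn.LayerNorm(normalized_shape, eps, elementwise_affine)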
fairseq/modules/speech_to_text/subsampling.py
@@ -142,6 +142,9 @@ class Conv1dSubsampling(nn.Module):
                 get_activation_class(act, dim=1))
             for layer_id in range(num_layers)])
         out_dim = filters[-1]
+
+        self.linear = nn.Linear(filters, filters)
+
     def forward(self, x, x_len):
         # (B, T, D) -> (B, D, T)
@@ -154,6 +157,7 @@ class Conv1dSubsampling(nn.Module):
         if x_len is not None:
             x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1

         x = x.transpose(1, 2)
+        x = self.linear(x)
         return x, x_len
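Note: the length update is the standard Conv1d output-length formula specialized to kernel size 3, stride 2, padding 1 (the usual subsampling configuration; the exact kernel and padding are assumptions here, as they are not visible in the hunk): out_len = floor((in_len - 1) / 2) + 1. A quick check:

    import torch

    x_len = torch.tensor([100, 75, 43])
    out_len = torch.div(x_len - 1, 2, rounding_mode="floor") + 1
    print(out_len)  # tensor([50, 38, 22])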