Commit 03ced14d by libei

revise the convert model script; now we use conver_t2t_to_fairseq.py to convert the t2t checkpoint into fairseq.pt
parent d3205fc9
#! /usr/bin/bash
set -e
# device: you can set multiple devices, e.g. device=(0 1 2).
# The program will then translate in parallel over multiple evalsets (e.g. evalset=(cwmt18-dev mt06 mt08))
# or over multiple alphas (e.g. alphas=(1.0 1.1 1.2)); see the sketch below.
# Note that multiple evalsets and multiple alphas cannot be set at the same time.
# Extra devices will not be used: e.g. if you set device=(0 1 2 3) but choose only three evalsets, gpu 3 stays idle.
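# A rough sketch (hypothetical helper name, not part of this script) of how such
# a fan-out could look, one evalset per device, waiting for all background jobs:
#   for i in "${!evalset[@]}"; do
#     CUDA_VISIBLE_DEVICES=${device[$i]} bash translate_one.sh "${evalset[$i]}" &
#   done
#   wait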
device=(0 1 2 3 4 5 6 7)
# your model
model=dense_transformer
# your hparams
params=dense_transformer_base
# your tag, must set!
tag=dense16
model_dir=t2tmodel/$tag/ensemble15
output_dir=checkpoints/$tag
if [ ! -d "$output_dir" ]; then
    mkdir -p "$output_dir"
fi
# "base" hparams use 8 attention heads; other settings (e.g. big) use 16
if [[ $params == *base* ]]; then
    n_head=8
else
    n_head=16
fi
echo "n_head=$n_head"
CUDA_VISIBLE_DEVICES=0 python3 scripts/convert_dense_to_fairseq.py -model $model_dir/ensemble_15-0 -src_vocab t2tmodel/source_dic -tgt_vocab t2tmodel/target_dic -head_num $n_head -vocab_output $model_dir/fairseq.vocab -model_output $output_dir/fairseq.pt
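# online.vocab is a temporary file written by the converter's write_vocab();
# once convert_dict() has produced the final vocab file it is no longer needed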
rm online.vocab
......@@ -6,10 +6,6 @@ set -e
# Note that multiple evalsets and multiple alphas cannot be set at the same time.
# Extra devices will not be used: e.g. if you set device=(0 1 2 3) but choose only three evalsets, gpu 3 stays idle.
device=(0 1 2 3 4 5 6 7)
# your model
model=transformer
# your hparams
params=transformer_big
# your tag, must set!
tag=big_v3_multistep4
......
......@@ -28,8 +28,6 @@ def find_useful_param(model_file):
global activation_function
global use_relative_position_representation
global max_relative_length
global normalize_before
global use_dense
trainable_param = dict()
try:
......@@ -75,16 +73,6 @@ def find_useful_param(model_file):
use_relative_position_representation = 1
max_relative_length = (var_to_shape_map[key][0] - 1) // 2
pattern = re.compile('norm_top/layer_norm_bias')
match = re.search(pattern, key)
if match and normalize_before is False:
normalize_before = True
pattern = re.compile('/layer_history/layer_weight')
match = re.search(pattern, key)
if match and use_dense is False:
use_dense = True
except Exception as e: # pylint: disable=broad-except
print(str(e))
assert len(trainable_param) > 0, "no trainable parameters found"
......@@ -99,9 +87,9 @@ def find_useful_param(model_file):
if use_relative_position_representation is None:
use_relative_position_representation = 0
print('find src-vocab:{} tgt-vocab:{} emb-size:{} src-layer:{} tgt-layer:{} activation:{} relative:{} normalize_before:{} use_dense:{}'.
print('find src-vocab:{} tgt-vocab:{} emb-size:{} src-layer:{} tgt-layer:{} activation:{} relative:{}'.
format(src_vocab_size, tgt_vocab_size, emb_size, src_layer_num, tgt_layer_num,
activation_function, use_relative_position_representation, normalize_before, use_dense))
activation_function, use_relative_position_representation))
......@@ -160,7 +148,7 @@ def load_param():
# However, in practice, we write the pos_emb on both the source and target sides
# This is convenient for us to init the encoder and decoder independently
param_dict['src_pos_emb'] = get_pos_emb(src_max_length, emb_size=emb_size)
print(sess.run(param_dict['src_pos_emb']))
# encoder
for layer_id in range(src_layer_num):
# step 1. qvk_trans_w shape is H * 3H (the query, key and value projections are fused into one matrix); qvk_trans_b shape is 3H
......@@ -209,26 +197,6 @@ def load_param():
param_dict['enc_%d_ln_scale_2' % layer_id] = tf.reshape(layer_normal_scale_2, [1, -1])
param_dict['enc_%d_ln_bias_2' % layer_id] = tf.reshape(layer_normal_bias_2, [1, -1])
if use_dense:
# step 9. layer normalization dense
layer_normal_bias_dense = _get_tensor('body/encoder/layer_history/layer_norm%d/layer_norm_bias' % layer_id)
layer_normal_scale_dense = _get_tensor('body/encoder/layer_history/layer_norm%d/layer_norm_scale' % layer_id)
param_dict['enc_%d_ln_scale_dense' % layer_id] = tf.reshape(layer_normal_scale_dense, [1, -1])
param_dict['enc_%d_ln_bias_dense' % layer_id] = tf.reshape(layer_normal_bias_dense, [1, -1])
# step 7. layer normalization top
if normalize_before:
layer_normal_bias_top = _get_tensor('body/encoder/norm_top/layer_norm_bias')
layer_normal_scale_top = _get_tensor('body/encoder/norm_top/layer_norm_scale')
param_dict['enc_ln_bias_top'] = tf.reshape(layer_normal_bias_top, [1, -1])
param_dict['enc_ln_scale_top'] = tf.reshape(layer_normal_scale_top, [1, -1])
# step 8. dense transformer weight matrix
if use_dense:
layer_weight = _get_tensor('body/encoder/layer_history/layer_weight')
print(type(layer_weight))
param_dict['enc_layer_weight'] = layer_weight
# decoder
# target embedding, shape is tgt_vocab * emb_size
tgt_emb_tensor = _get_tensor('symbol_modality_{}_{}/target_emb/weights'.format(tgt_vocab_size, emb_size),
......@@ -314,31 +282,24 @@ def load_param():
param_dict['dec_%d_encdecatt_ln_scale_2' % layer_id] = tf.reshape(encdec_layer_normal_scale_2, [1, -1])
param_dict['dec_%d_encdecatt_ln_bias_2' % layer_id] = tf.reshape(encdec_layer_normal_bias_2, [1, -1])
if use_dense:
# step 9. layer normalization dense
layer_normal_bias_dense = _get_tensor('body/decoder/layer_history/layer_norm%d/layer_norm_bias' % layer_id)
layer_normal_scale_dense = _get_tensor('body/decoder/layer_history/layer_norm%d/layer_norm_scale' % layer_id)
param_dict['dec_%d_ln_scale_dense' % layer_id] = tf.reshape(layer_normal_scale_dense, [1, -1])
param_dict['dec_%d_ln_bias_dense' % layer_id] = tf.reshape(layer_normal_bias_dense, [1, -1])
# step 7. layer normalization top
if normalize_before:
layer_normal_bias_top = _get_tensor('body/decoder/norm_top/layer_norm_bias')
layer_normal_scale_top = _get_tensor('body/decoder/norm_top/layer_norm_scale')
param_dict['dec_ln_bias_top'] = tf.reshape(layer_normal_bias_top, [1, -1])
param_dict['dec_ln_scale_top'] = tf.reshape(layer_normal_scale_top, [1, -1])
# step 8. dense transformer weight matrix
if use_dense:
layer_weight = _get_tensor('body/decoder/layer_history/layer_weight')
param_dict['dec_layer_weight'] = layer_weight
softmax_w_tensor = _get_tensor('symbol_modality_{}_{}/softmax/weights'.format(tgt_vocab_size, emb_size),
shard=shard)
# note: we transpose the matrix, from (V,H) to (H,V)
param_dict['softmax_w'] = softmax_w_tensor
return param_dict
def _get_param_numpy_bak(name, sess, transpose=False):
assert name in param_dict, "unknown param name:{} in dict".format(name)
value = param_dict[name]
if transpose:
value = tf.transpose(value,[1, 0])
if type(value) is tf.Tensor:
value = sess.run(value)
value = torch.from_numpy(np.array(value)).float()
if value.dim() == 2 and value.size(1) == 1:
value = value.squeeze(1)
return value
def _get_param_numpy(name, sess, transpose=False):
assert name in param_dict, "unknown param name:{} in dict".format(name)
value = param_dict[name]
......@@ -352,9 +313,9 @@ def _get_param_numpy(name, sess, transpose=False):
if value.dim() == 2 and value.size(1) == 1:
value = value.squeeze(1)
print('%s size=%s' % (name, str(value.size())))
return value
def write_vocab():
print('write vocab file ...')
vocab_file = open('online.vocab', 'w')
......@@ -393,13 +354,8 @@ def convert_settings(settings):
args['decoder_ffn_embed_dim'] = int(settings['inner_hidden_size'])
args['encoder_attention_heads'] = int(settings['head_num'])
args['decoder_attention_heads'] = int(settings['head_num'])
if use_dense:
args['arch'] = 'dense_transformer'
args['encoder_history_type'] = 'learnable_dense'
args['decoder_history_type'] = 'learnable_dense'
args['encoder_integration_type'] = 'avg'
args['decoder_integration_type'] = 'avg'
# args[''] = settings['src_max_length']
# args[''] = settings['tgt_max_length']
# const params
args['encoder_learned_pos'] = True
......@@ -415,13 +371,6 @@ def convert_settings(settings):
args['dropout'] = 0.1
args['attention_dropout'] = 0.0
args['relu_dropout'] = 0.0
if normalize_before:
args['encoder_normalize_before'] = True
args['decoder_normalize_before'] = True
args['attention_dropout'] = 0.1
args['relu_dropout'] = 0.1
return argparse.Namespace(**args)
......@@ -440,8 +389,6 @@ def creat_settings():
settings["activation"] = str(activation_function)
settings["use_rpr"] = str(use_relative_position_representation)
settings["max_relative_length"] = str(max_relative_length)
settings["normalize_before"] = str(normalize_before)
settings["use_dense"] = str(use_dense)
return settings
......@@ -497,22 +444,6 @@ def convert_param():
model['%s.1.weight' % p1] = _get_param_numpy('enc_%d_ln_scale_2' % p2, sess, transpose=True)
model['%s.1.bias' % p1] = _get_param_numpy('enc_%d_ln_bias_2' % p2, sess, transpose=True)
if use_dense:
p1 = 'encoder.history.layer_norms.%d' % layer_id
model['%s.weight' % p1] = _get_param_numpy('enc_%d_ln_scale_dense' % p2, sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('enc_%d_ln_bias_dense' % p2, sess, transpose=True)
if normalize_before:
p1 = 'encoder.layer_norm'
model['%s.weight' % p1] = _get_param_numpy('enc_ln_scale_top', sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('enc_ln_bias_top', sess, transpose=True)
if use_dense:
enc_layer_weight = _get_param_numpy('enc_layer_weight', sess)
scale = torch.sum(enc_layer_weight, dim=1, keepdim=True)
enc_layer_weight = enc_layer_weight / scale
model['encoder.history.weight'] = enc_layer_weight
embed = _get_param_numpy('tgt_emb', sess)
"""
# <PAD> <EOS> <UNK> <Lua> -> <Lua> <PAD> <EOS> <UNK>
......@@ -569,22 +500,6 @@ def convert_param():
model['%s.2.weight' % p1] = _get_param_numpy('dec_%d_encdecatt_ln_scale_2' % p2, sess, transpose=True)
model['%s.2.bias' % p1] = _get_param_numpy('dec_%d_encdecatt_ln_bias_2' % p2, sess, transpose=True)
if use_dense:
p1 = 'decoder.history.layer_norms.%d' % layer_id
model['%s.weight' % p1] = _get_param_numpy('dec_%d_ln_scale_dense' % p2, sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('dec_%d_ln_bias_dense' % p2, sess, transpose=True)
if normalize_before:
p1 = 'decoder.layer_norm'
model['%s.weight' % p1] = _get_param_numpy('dec_ln_scale_top', sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('dec_ln_bias_top', sess, transpose=True)
if use_dense:
dec_layer_weight = _get_param_numpy('dec_layer_weight', sess)
scale = torch.sum(dec_layer_weight, dim=1, keepdim=True)
dec_layer_weight = dec_layer_weight / scale
model['decoder.history.weight'] = dec_layer_weight
softmax_w_tensor = _get_param_numpy('softmax_w', sess)
"""
pad_row = softmax_w_tensor[0, :].unsqueeze(0)
......@@ -688,8 +603,6 @@ if __name__ == '__main__':
activation_function = None
use_relative_position_representation = None
max_relative_length = -1
normalize_before = False
use_dense = False
start = time.time()
......@@ -707,10 +620,9 @@ if __name__ == '__main__':
ck_state = {}
ck_state['args'] = convert_settings(settings)
print(ck_state)
ck_state['model'] = convert_param()
print(ck_state['args'])
# print(ck_state['model'].keys())
write_vocab()
convert_dict('online.vocab', args.vocab_output)
......
......@@ -28,6 +28,8 @@ def find_useful_param(model_file):
global activation_function
global use_relative_position_representation
global max_relative_length
global normalize_before
global use_dense
trainable_param = dict()
try:
......@@ -73,6 +75,16 @@ def find_useful_param(model_file):
use_relative_position_representation = 1
max_relative_length = (var_to_shape_map[key][0] - 1) // 2
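# NOTE: the relative-position table stores offsets -L..L, i.e. 2L+1 rows,
# so L is recovered as (rows - 1) // 2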
pattern = re.compile('norm_top/layer_norm_bias')
match = re.search(pattern, key)
if match and normalize_before is False:
normalize_before = True
pattern = re.compile('/layer_history/layer_weight')
match = re.search(pattern, key)
if match and use_dense is False:
use_dense = True
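# NOTE: features are detected from checkpoint variable names: a norm_top
# layer norm implies a pre-norm (normalize_before) model, and a
# layer_history/layer_weight variable implies a dense transformer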
except Exception as e: # pylint: disable=broad-except
print(str(e))
assert len(trainable_param) > 0, "no trainable parameters found"
......@@ -87,9 +99,9 @@ def find_useful_param(model_file):
if use_relative_position_representation is None:
use_relative_position_representation = 0
print('find src-vocab:{} tgt-vocab:{} emb-size:{} src-layer:{} tgt-layer:{} activation:{} relative:{}'.
print('find src-vocab:{} tgt-vocab:{} emb-size:{} src-layer:{} tgt-layer:{} activation:{} relative:{} normalize_before:{} use_dense:{}'.
format(src_vocab_size, tgt_vocab_size, emb_size, src_layer_num, tgt_layer_num,
activation_function, use_relative_position_representation))
activation_function, use_relative_position_representation, normalize_before, use_dense))
......@@ -148,7 +160,7 @@ def load_param():
# However, in practice, we write the pos_emb on both the source and target sides
# This is convenient for us to init the encoder and decoder independently
param_dict['src_pos_emb'] = get_pos_emb(src_max_length, emb_size=emb_size)
print(sess.run(param_dict['src_pos_emb']))
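# sketch (assumption, get_pos_emb is not shown in this diff): if it follows
# t2t's standard sinusoidal timing signal, it would be roughly
#   pos = np.arange(max_len, dtype=np.float32)[:, None]          # (L, 1)
#   half = emb_size // 2
#   inv_ts = 1.0 / (10000.0 ** (np.arange(half) / max(half - 1, 1)))
#   pos_emb = np.concatenate([np.sin(pos * inv_ts), np.cos(pos * inv_ts)], axis=1)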
# encoder
for layer_id in range(src_layer_num):
# step 1. qvk_trans_w shape is H * 3H (the query, key and value projections are fused into one matrix); qvk_trans_b shape is 3H
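# sketch (hypothetical, not used below): if q/k/v were ever needed separately,
# the fused weight could be split column-wise:
#   q_w, k_w, v_w = tf.split(qkv_trans_w, num_or_size_splits=3, axis=1)  # each H x H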
......@@ -197,6 +209,26 @@ def load_param():
param_dict['enc_%d_ln_scale_2' % layer_id] = tf.reshape(layer_normal_scale_2, [1, -1])
param_dict['enc_%d_ln_bias_2' % layer_id] = tf.reshape(layer_normal_bias_2, [1, -1])
if use_dense:
# step 9. layer normalization dense
layer_normal_bias_dense = _get_tensor('body/encoder/layer_history/layer_norm%d/layer_norm_bias' % layer_id)
layer_normal_scale_dense = _get_tensor('body/encoder/layer_history/layer_norm%d/layer_norm_scale' % layer_id)
param_dict['enc_%d_ln_scale_dense' % layer_id] = tf.reshape(layer_normal_scale_dense, [1, -1])
param_dict['enc_%d_ln_bias_dense' % layer_id] = tf.reshape(layer_normal_bias_dense, [1, -1])
# step 7. layer normalization top
if normalize_before:
layer_normal_bias_top = _get_tensor('body/encoder/norm_top/layer_norm_bias')
layer_normal_scale_top = _get_tensor('body/encoder/norm_top/layer_norm_scale')
param_dict['enc_ln_bias_top'] = tf.reshape(layer_normal_bias_top, [1, -1])
param_dict['enc_ln_scale_top'] = tf.reshape(layer_normal_scale_top, [1, -1])
# step 8. dense transformer weight matrix
if use_dense:
layer_weight = _get_tensor('body/encoder/layer_history/layer_weight')
print(type(layer_weight))
param_dict['enc_layer_weight'] = layer_weight
# decoder
# target embedding, shape is tgt_vocab * emb_size
tgt_emb_tensor = _get_tensor('symbol_modality_{}_{}/target_emb/weights'.format(tgt_vocab_size, emb_size),
......@@ -282,24 +314,31 @@ def load_param():
param_dict['dec_%d_encdecatt_ln_scale_2' % layer_id] = tf.reshape(encdec_layer_normal_scale_2, [1, -1])
param_dict['dec_%d_encdecatt_ln_bias_2' % layer_id] = tf.reshape(encdec_layer_normal_bias_2, [1, -1])
if use_dense:
# step 9. layer normalization dense
layer_normal_bias_dense = _get_tensor('body/decoder/layer_history/layer_norm%d/layer_norm_bias' % layer_id)
layer_normal_scale_dense = _get_tensor('body/decoder/layer_history/layer_norm%d/layer_norm_scale' % layer_id)
param_dict['dec_%d_ln_scale_dense' % layer_id] = tf.reshape(layer_normal_scale_dense, [1, -1])
param_dict['dec_%d_ln_bias_dense' % layer_id] = tf.reshape(layer_normal_bias_dense, [1, -1])
# step 7. layer normalization top
if normalize_before:
layer_normal_bias_top = _get_tensor('body/decoder/norm_top/layer_norm_bias')
layer_normal_scale_top = _get_tensor('body/decoder/norm_top/layer_norm_scale')
param_dict['dec_ln_bias_top'] = tf.reshape(layer_normal_bias_top, [1, -1])
param_dict['dec_ln_scale_top'] = tf.reshape(layer_normal_scale_top, [1, -1])
# step 8. dense transformer weight matrix
if use_dense:
layer_weight = _get_tensor('body/decoder/layer_history/layer_weight')
param_dict['dec_layer_weight'] = layer_weight
softmax_w_tensor = _get_tensor('symbol_modality_{}_{}/softmax/weights'.format(tgt_vocab_size, emb_size),
shard=shard)
# note: we transpose the matrix, from (V,H) to (H,V)
param_dict['softmax_w'] = softmax_w_tensor
return param_dict
def _get_param_numpy_bak(name, sess, transpose=False):
assert name in param_dict, "unknown param name:{} in dict".format(name)
value = param_dict[name]
if transpose:
value = tf.transpose(value,[1, 0])
if type(value) is tf.Tensor:
value = sess.run(value)
value = torch.from_numpy(np.array(value)).float()
if value.dim() == 2 and value.size(1) == 1:
value = value.squeeze(1)
return value
def _get_param_numpy(name, sess, transpose=False):
assert name in param_dict, "unknown param name:{} in dict".format(name)
value = param_dict[name]
......@@ -313,9 +352,9 @@ def _get_param_numpy(name, sess, transpose=False):
if value.dim() == 2 and value.size(1) == 1:
value = value.squeeze(1)
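# NOTE: layer-norm scales/biases are stored as [1, H] rows; with transpose=True
# they become [H, 1] here and are squeezed to the 1-D [H] vectors fairseq expects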
print('%s size=%s' % (name, str(value.size())))
return value
def write_vocab():
print('write vocab file ...')
vocab_file = open('online.vocab', 'w')
......@@ -354,8 +393,13 @@ def convert_settings(settings):
args['decoder_ffn_embed_dim'] = int(settings['inner_hidden_size'])
args['encoder_attention_heads'] = int(settings['head_num'])
args['decoder_attention_heads'] = int(settings['head_num'])
# args[''] = settings['src_max_length']
# args[''] = settings['tgt_max_length']
if use_dense:
args['arch'] = 'dense_transformer'
args['encoder_history_type'] = 'learnable_dense'
args['decoder_history_type'] = 'learnable_dense'
args['encoder_integration_type'] = 'avg'
args['decoder_integration_type'] = 'avg'
# const params
args['encoder_learned_pos'] = True
......@@ -371,6 +415,13 @@ def convert_settings(settings):
args['dropout'] = 0.1
args['attention_dropout'] = 0.0
args['relu_dropout'] = 0.0
if normalize_before:
args['encoder_normalize_before'] = True
args['decoder_normalize_before'] = True
args['attention_dropout'] = 0.1
args['relu_dropout'] = 0.1
return argparse.Namespace(**args)
......@@ -389,6 +440,8 @@ def creat_settings():
settings["activation"] = str(activation_function)
settings["use_rpr"] = str(use_relative_position_representation)
settings["max_relative_length"] = str(max_relative_length)
settings["normalize_before"] = str(normalize_before)
settings["use_dense"] = str(use_dense)
return settings
......@@ -444,6 +497,22 @@ def convert_param():
model['%s.1.weight' % p1] = _get_param_numpy('enc_%d_ln_scale_2' % p2, sess, transpose=True)
model['%s.1.bias' % p1] = _get_param_numpy('enc_%d_ln_bias_2' % p2, sess, transpose=True)
if use_dense:
p1 = 'encoder.history.layer_norms.%d' % layer_id
model['%s.weight' % p1] = _get_param_numpy('enc_%d_ln_scale_dense' % p2, sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('enc_%d_ln_bias_dense' % p2, sess, transpose=True)
if normalize_before:
p1 = 'encoder.layer_norm'
model['%s.weight' % p1] = _get_param_numpy('enc_ln_scale_top', sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('enc_ln_bias_top', sess, transpose=True)
if use_dense:
enc_layer_weight = _get_param_numpy('enc_layer_weight', sess)
scale = torch.sum(enc_layer_weight, dim=1, keepdim=True)
enc_layer_weight = enc_layer_weight / scale
model['encoder.history.weight'] = enc_layer_weight
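# NOTE: the learnable dense (layer-history) weights are row-normalized above so
# that each layer's mixture over earlier layers sums to 1, matching the 'avg'
# integration type set in convert_settings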
embed = _get_param_numpy('tgt_emb', sess)
"""
# <PAD> <EOS> <UNK> <Lua> -> <Lua> <PAD> <EOS> <UNK>
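# sketch: roughly equivalent to embed = torch.cat([embed[3:4], embed[0:3], embed[4:]], dim=0)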
......@@ -500,6 +569,22 @@ def convert_param():
model['%s.2.weight' % p1] = _get_param_numpy('dec_%d_encdecatt_ln_scale_2' % p2, sess, transpose=True)
model['%s.2.bias' % p1] = _get_param_numpy('dec_%d_encdecatt_ln_bias_2' % p2, sess, transpose=True)
if use_dense:
p1 = 'decoder.history.layer_norms.%d' % layer_id
model['%s.weight' % p1] = _get_param_numpy('dec_%d_ln_scale_dense' % p2, sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('dec_%d_ln_bias_dense' % p2, sess, transpose=True)
if normalize_before:
p1 = 'decoder.layer_norm'
model['%s.weight' % p1] = _get_param_numpy('dec_ln_scale_top', sess, transpose=True)
model['%s.bias' % p1] = _get_param_numpy('dec_ln_bias_top', sess, transpose=True)
if use_dense:
dec_layer_weight = _get_param_numpy('dec_layer_weight', sess)
scale = torch.sum(dec_layer_weight, dim=1, keepdim=True)
dec_layer_weight = dec_layer_weight / scale
model['decoder.history.weight'] = dec_layer_weight
softmax_w_tensor = _get_param_numpy('softmax_w', sess)
"""
pad_row = softmax_w_tensor[0, :].unsqueeze(0)
......@@ -603,6 +688,8 @@ if __name__ == '__main__':
activation_function = None
use_relative_position_representation = None
max_relative_length = -1
normalize_before = False
use_dense = False
start = time.time()
......@@ -620,9 +707,10 @@ if __name__ == '__main__':
ck_state = {}
ck_state['args'] = convert_settings(settings)
print(ck_state)
ck_state['model'] = convert_param()
print(ck_state['args'])
# print(ck_state['model'].keys())
write_vocab()
convert_dict('online.vocab', args.vocab_output)
......