Commit 23066e0

update
1 parent 918e20e commit 23066e0

1 file changed

users/zeineldeen/experiments/conformer_att_2023/swb_300/configs/swb_lstm_dec.py
Lines changed: 143 additions & 9 deletions
@@ -136,6 +136,7 @@ def run_train(exp_name, train_args, train_data, feature_extraction_net, num_epoc
         kwargs.get("returnn_root", RETURNN_ROOT),
         num_epochs=num_epochs,
         gpu_mem=kwargs.get("gpu_mem", 11),
+        horovod_num_processes=kwargs.get("horovod_num_processes", None),
     )
     return train_job
 
@@ -345,10 +346,11 @@ def run_search(
         returnn_exe=RETURNN_CPU_EXE,
         returnn_root=kwargs.get("returnn_root", RETURNN_ROOT),
         num_average=num_avg,
+        key=kwargs.get("avg_key", "dev_score_output/output_prob"),
     )
     train_job_avg_ckpt[exp_name] = averaged_checkpoint
 
-    best_checkpoint = get_best_checkpoint(train_job)
+    best_checkpoint = get_best_checkpoint(train_job, key=kwargs.get("avg_key", "dev_score_output/output_prob"))
     train_job_best_epoch[exp_name] = best_checkpoint
 
     if recog_epochs is None:
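
Note on the new key/avg_key parameter: checkpoint averaging and best-checkpoint selection are now keyed on a configurable dev score instead of the hard-coded "dev_score_output/output_prob"; the no-CTC experiment further below passes avg_key="dev_score", since with only a single loss RETURNN reports the plain "dev_score" key. A minimal, hypothetical sketch of the selection idea (not the actual i6_core helpers):

    # Hypothetical sketch, not the i6_core implementation of get_best_checkpoint:
    # pick the best epochs by a configurable score key from per-epoch dev scores.
    def select_best_epochs(scores_per_epoch, key="dev_score_output/output_prob", n=1):
        # scores_per_epoch: {epoch: {score_name: value}}, lower is better
        return sorted(scores_per_epoch, key=lambda ep: scores_per_epoch[ep][key])[:n]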
@@ -420,6 +422,7 @@ def run_exp(
     train_data = build_training_datasets(
         bpe_size=bpe_size,
         use_raw_features=True,
+        partition_epoch=kwargs.get("partition_epoch", 6),
         epoch_wise_filter=kwargs.get("epoch_wise_filter", None),
         link_speed_perturbation=train_args.get("speed_pert", False),
         seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"),
@@ -859,14 +862,22 @@ def get_base_v2_args(
     if lr_type == "epoch-oclr":
         lr = lr_opts["lr"]
         initial_lr = lr_opts.get("initial_lr", lr / 10)
-        cyc_ep = int(0.45 * num_epochs)
+        cyc1_factor = lr_opts.get("cyc1_factor", 0.45)
+        cyc2_factor = lr_opts.get("cyc2_factor", 0.45)
+        cyc1_ep = int(cyc1_factor * num_epochs)
+        cyc2_ep = int(cyc2_factor * num_epochs)
+        finetune_ep = num_epochs - cyc1_ep - cyc2_ep
+        assert cyc1_ep + cyc2_ep + finetune_ep == num_epochs, "OCLR epochs do not add up."
         base_v2_args["learning_rates_list"] = (
-            list(numpy.linspace(initial_lr, lr, cyc_ep))
-            + list(numpy.linspace(lr, initial_lr, cyc_ep))
-            + list(numpy.linspace(initial_lr, 1e-6, ep - 2 * cyc_ep))
+            list(numpy.linspace(initial_lr, lr, cyc1_ep))
+            + list(numpy.linspace(lr, initial_lr, cyc2_ep))
+            + list(numpy.linspace(initial_lr, 1e-6, finetune_ep))
         )
         assert len(base_v2_args["learning_rates_list"]) == num_epochs
         exp_name += f"_epocOCLR-{initial_lr}-{lr}"
+        if cyc1_factor != 0.45 or cyc2_factor != 0.45:
+            exp_name += f"_epocOCLR-{initial_lr}-{lr}-{cyc1_factor}-{cyc2_factor}"
+
     elif lr_type == "step-oclr":
         base_v2_args["oclr_opts"]["peak_lr"] = lr_opts["lr"]
         base_v2_args["oclr_opts"]["total_ep"] = num_epochs
@@ -947,8 +958,14 @@ def get_base_v2_args(
                 bpe_size=BPE_500,
             )
 
+    # hub5e00 hub5e01 rt03s
+    #
+    # with pretraining: 12.4 11.1 13.0
+    # with pretraining + disable specaug initially: 12.5 11.1 13.4
+    # without pretraining + disable specaug initially: 12.4 11.2 13.2
+
     for ep in [50 * 6]:
-        for num_blocks, reduce_factor in [(8, 1.0)]:
+        for num_blocks, reduce_factor in [(8, 1.0), (12, 1.0)]:
             # TODO: smaller target embed dim
             args, name = get_base_v2_args(ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3})
             args["specaug_version"] = 1
@@ -961,6 +978,66 @@ def get_base_v2_args(
                 bpe_size=BPE_500,
             )
 
+            # TODO: pretrain + step-based oclr
+            # note that batch size is larger during pretrain so the oclr here is not symmetric
+            args, name = get_base_v2_args(
+                ep, num_blocks, reduce_factor, lr_type="step-oclr", lr_opts={"lr": 1e-3, "n_step": 1450}
+            )
+            args["specaug_version"] = 1
+            args["decoder_args"].embed_dim = 256
+            run_default_exp(
+                name + f"_embed256_specaug1",
+                train_args=args,
+                num_epochs=ep,
+                gpu_mem=11,
+                bpe_size=BPE_500,
+            )
+
+            # TODO: no pretrain
+            for oclr_n_step in [None, 1450]:
+                if num_blocks == 12:
+                    continue
+                if oclr_n_step is None and num_blocks == 8:
+                    continue
+                if oclr_n_step is not None:
+                    # step-based
+                    args, name = get_base_v2_args(
+                        ep, num_blocks, reduce_factor, lr_type="step-oclr", lr_opts={"lr": 1e-3, "n_step": oclr_n_step}
+                    )
+                else:
+                    # epoch-based
+                    args, name = get_base_v2_args(
+                        ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3}
+                    )
+
+                if num_blocks == 8:
+                    specaug_steps = {"step0": 6_000, "step1": 8_000, "step2": 10_000}
+                elif num_blocks == 12:
+                    specaug_steps = {"step0": 10_000, "step1": 15_000, "step2": 20_000}
+                else:
+                    raise NotImplementedError
+
+                args["specaug_str_func_opts"] = {
+                    "version": 2,
+                    **specaug_steps,
+                    "max_time_num": 100,
+                    "max_time_dim": 20,
+                    "min_num_add_factor": 0,
+                    "freq_dim_factor": 5,
+                }
+                args["decoder_args"].embed_dim = 256
+                args["with_pretrain"] = False
+                run_default_exp(
+                    name + f"_embed256_specaugCurrV1_noPretrain",
+                    train_args=args,
+                    num_epochs=ep,
+                    gpu_mem=11,
+                    bpe_size=BPE_500,
+                )
+
+    # TODO: mixup
+    for ep in [50 * 6]:
+        for num_blocks, reduce_factor in [(8, 1.0)]:
             for apply_drop in [0.1, 0.2, 0.3, 0.4]:
                 for max_num_mix in [4, 5]:
                     for lambda_min_max in [(0.15, 0.3), (0.1, 0.3)]:
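
The "step0"/"step1"/"step2" thresholds in specaug_str_func_opts configure a step-based SpecAugment curriculum (hence "specaugCurrV1" in the experiment name): masking strength is ramped up as the global train step passes each threshold, used here together with args["with_pretrain"] = False. A rough, hypothetical sketch of that gating idea (not the repo's actual version-2 string function):

    # Hypothetical sketch of step-gated SpecAugment strength; the real behaviour
    # is defined by the repo's specaug_str_func_opts (version 2), not this helper.
    def time_mask_budget(global_train_step, max_time_num=100,
                         step0=6_000, step1=8_000, step2=10_000):
        # Ramp the allowed number of time masks up in stages as training progresses.
        if global_train_step < step0:
            factor = 0.0  # no time masking at the very start
        elif global_train_step < step1:
            factor = 0.25
        elif global_train_step < step2:
            factor = 0.5
        else:
            factor = 1.0  # full masking once training has stabilized
        return int(factor * max_time_num)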
@@ -1037,6 +1114,63 @@ def get_base_v2_args(
     # TODO: staged hyperparams
     # - weight noise: disable for first 45% of epochs for example and enable it later
     # - apply curriculum learning for utterances?
-    # - grad clip: /4, /2, /1
-    # - schedule sampling?
-    # - label smoothing?
+
+    # TODO: no ctc
+    for ep in [50 * 6]:
+        for num_blocks, reduce_factor in [(8, 1.0)]:
+            # TODO: smaller target embed dim
+            args, name = get_base_v2_args(ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3})
+            args["specaug_version"] = 1
+            args["decoder_args"].embed_dim = 256
+            args["encoder_args"].with_ctc = False
+            run_default_exp(
+                name + f"_embed256_specaug1_noCTC",
+                train_args=args,
+                num_epochs=ep,
+                gpu_mem=11,
+                bpe_size=BPE_500,
+                avg_key="dev_score",
+            )
+
+    # TODO: multi-gpu training
+    # conf_8l_dimF1.0_bpe500_drop0.1_selfAttDrop0.15_decDrop0.2_embedDrop0.05_wd0.0_ep300_epocOCLR-0.0001-0.001_embed256_specaug1
+    # 12.4 11.1 13 avg
+
+    # TODO: param sync
+    # gpu4_paramSync_step50_accum1_gradClipNorm5 13.7 12.1 14.5 avg
+    # gpu4_paramSync_step100_accum1_gradClipNorm20 13.8 12.3 14.5 avg
+    # gpu4_paramSync_step100_accum1_gradClipNorm5 14 12.1 14.6 avg
+    for ep in [50 * 6]:
+        for num_blocks, reduce_factor in [(8, 1.0)]:
+            for sync_step in [50]:
+                for gradient_clip_global_norm in [5]:
+                    for lr_opts in [
+                        {"lr": 1e-3},
+                        {"lr": 1e-3, "initial_lr": 4e-4},  # higher initial LR
+                        {"lr": 1e-3, "cyc1_factor": 0.5, "cyc2_factor": 0.5},  # no fine-tuning
+                        {"lr": 1e-3, "cyc1_factor": 0.2, "cyc2_factor": 0.7},  # shorter warmup
+                    ]:
+                        args, name = get_base_v2_args(
+                            ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts=lr_opts
+                        )
+                        args["accum_grad"] = 1
+                        args["specaug_version"] = 1
+                        args["decoder_args"].embed_dim = 256
+                        args["horovod_params"] = {
+                            "horovod_reduce_type": "param",
+                            "horovod_param_sync_step": sync_step,
+                            "horovod_dataset_distribution": "random_seed_offset",
+                        }
+                        exp_name = name + f"_embed256_specaug1_gpu4_paramSync_step{sync_step}_accum1"
+                        if gradient_clip_global_norm:
+                            args["gradient_clip_global_norm"] = gradient_clip_global_norm
+                            exp_name += f"_gradClipNorm{gradient_clip_global_norm}"
+                        run_default_exp(
+                            exp_name,
+                            train_args=args,
+                            num_epochs=ep,
+                            partition_epoch=6 * 4,
+                            gpu_mem=11,
+                            bpe_size=BPE_500,
+                            horovod_num_processes=4,
+                        )
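
The param-sync runs train on 4 GPUs with parameter averaging every sync_step steps instead of per-step gradient reduction; horovod_num_processes=4 is forwarded to the training job (see the change to run_train at line 136), and partition_epoch is raised from 6 to 6 * 4, presumably so that one sub-epoch covers about the same amount of data per worker when all four workers read the full dataset with different shuffling seeds (random_seed_offset). Assuming the horovod_params entries are written into the generated RETURNN config unchanged, the relevant settings would look roughly like this (a sketch, not the actual generated config):

    # Rough sketch of the RETURNN config settings implied by horovod_params above,
    # assuming the dict entries are forwarded into the config verbatim.
    use_horovod = True
    horovod_reduce_type = "param"                        # average parameters, not gradients
    horovod_param_sync_step = 50                         # sync parameters every 50 train steps
    horovod_dataset_distribution = "random_seed_offset"  # each worker shuffles the full data independently
    accum_grad_multiple_step = 1                         # args["accum_grad"] = 1 above
    gradient_clip_global_norm = 5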
