@@ -136,6 +136,7 @@ def run_train(exp_name, train_args, train_data, feature_extraction_net, num_epoc
         kwargs.get("returnn_root", RETURNN_ROOT),
         num_epochs=num_epochs,
         gpu_mem=kwargs.get("gpu_mem", 11),
+        horovod_num_processes=kwargs.get("horovod_num_processes", None),
     )
     return train_job
 
@@ -345,10 +346,11 @@ def run_search(
         returnn_exe=RETURNN_CPU_EXE,
         returnn_root=kwargs.get("returnn_root", RETURNN_ROOT),
         num_average=num_avg,
+        key=kwargs.get("avg_key", "dev_score_output/output_prob"),
     )
     train_job_avg_ckpt[exp_name] = averaged_checkpoint
 
-    best_checkpoint = get_best_checkpoint(train_job)
+    best_checkpoint = get_best_checkpoint(train_job, key=kwargs.get("avg_key", "dev_score_output/output_prob"))
     train_job_best_epoch[exp_name] = best_checkpoint
 
     if recog_epochs is None:
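Note on the new `key` argument (reviewer sketch, not part of the diff): both checkpoint averaging and `get_best_checkpoint` now rank epochs by a configurable score key, defaulting to the attention decoder's `dev_score_output/output_prob`; the no-CTC experiments further down pass `avg_key="dev_score"` instead, since that model logs a single dev score. A minimal standalone sketch of the selection logic, with `pick_best_epoch` and `scores_per_epoch` as hypothetical stand-ins for the real helpers:

    def pick_best_epoch(scores_per_epoch, key):
        # scores_per_epoch: {epoch: {score_key: value}}, as RETURNN logs
        # cross-validation scores per epoch; lower is better for these keys
        return min(scores_per_epoch, key=lambda epoch: scores_per_epoch[epoch][key])

    # hypothetical values: epoch 2 has the lowest decoder dev score
    scores = {
        1: {"dev_score_output/output_prob": 0.92},
        2: {"dev_score_output/output_prob": 0.71},
        3: {"dev_score_output/output_prob": 0.78},
    }
    assert pick_best_epoch(scores, "dev_score_output/output_prob") == 2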
@@ -420,6 +422,7 @@ def run_exp(
     train_data = build_training_datasets(
         bpe_size=bpe_size,
         use_raw_features=True,
+        partition_epoch=kwargs.get("partition_epoch", 6),
         epoch_wise_filter=kwargs.get("epoch_wise_filter", None),
         link_speed_perturbation=train_args.get("speed_pert", False),
         seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"),
@@ -859,14 +862,22 @@ def get_base_v2_args(
     if lr_type == "epoch-oclr":
         lr = lr_opts["lr"]
         initial_lr = lr_opts.get("initial_lr", lr / 10)
-        cyc_ep = int(0.45 * num_epochs)
+        cyc1_factor = lr_opts.get("cyc1_factor", 0.45)
+        cyc2_factor = lr_opts.get("cyc2_factor", 0.45)
+        cyc1_ep = int(cyc1_factor * num_epochs)
+        cyc2_ep = int(cyc2_factor * num_epochs)
+        finetune_ep = num_epochs - cyc1_ep - cyc2_ep
+        assert cyc1_ep + cyc2_ep + finetune_ep == num_epochs, "OCLR epochs do not add up."
         base_v2_args["learning_rates_list"] = (
-            list(numpy.linspace(initial_lr, lr, cyc_ep))
-            + list(numpy.linspace(lr, initial_lr, cyc_ep))
-            + list(numpy.linspace(initial_lr, 1e-6, ep - 2 * cyc_ep))
+            list(numpy.linspace(initial_lr, lr, cyc1_ep))
+            + list(numpy.linspace(lr, initial_lr, cyc2_ep))
+            + list(numpy.linspace(initial_lr, 1e-6, finetune_ep))
         )
         assert len(base_v2_args["learning_rates_list"]) == num_epochs
         exp_name += f"_epocOCLR-{initial_lr}-{lr}"
+        if cyc1_factor != 0.45 or cyc2_factor != 0.45:
+            exp_name += f"_epocOCLR-{initial_lr}-{lr}-{cyc1_factor}-{cyc2_factor}"
+
     elif lr_type == "step-oclr":
         base_v2_args["oclr_opts"]["peak_lr"] = lr_opts["lr"]
         base_v2_args["oclr_opts"]["total_ep"] = num_epochs
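The generalized epoch-OCLR schedule above splits training into three linear phases: `cyc1_factor` and `cyc2_factor` control the ramp-up and ramp-down lengths, and the remainder decays to 1e-6. A standalone sketch of exactly that construction, with the surrounding `base_v2_args`/`exp_name` plumbing omitted:

    import numpy

    def epoch_oclr(num_epochs, lr=1e-3, initial_lr=None, cyc1_factor=0.45, cyc2_factor=0.45):
        # three phases: initial_lr -> lr, lr -> initial_lr, initial_lr -> 1e-6
        initial_lr = lr / 10 if initial_lr is None else initial_lr
        cyc1_ep = int(cyc1_factor * num_epochs)
        cyc2_ep = int(cyc2_factor * num_epochs)
        finetune_ep = num_epochs - cyc1_ep - cyc2_ep  # absorbs the rounding from int()
        lrs = (
            list(numpy.linspace(initial_lr, lr, cyc1_ep))
            + list(numpy.linspace(lr, initial_lr, cyc2_ep))
            + list(numpy.linspace(initial_lr, 1e-6, finetune_ep))
        )
        assert len(lrs) == num_epochs
        return lrs

    # defaults with 300 epochs: 135 up, 135 down, 30 decay epochs;
    # cyc1_factor=cyc2_factor=0.5 (used below) removes the decay phase entirely
    assert len(epoch_oclr(300)) == 300 and max(epoch_oclr(300)) == 1e-3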
@@ -947,8 +958,14 @@ def get_base_v2_args(
             bpe_size=BPE_500,
         )
 
+#                                                    hub5e00  hub5e01  rt03s
+#
+# with pretraining:                                  12.4     11.1     13.0
+# with pretraining + disable specaug initially:      12.5     11.1     13.4
+# without pretraining + disable specaug initially:   12.4     11.2     13.2
+
 for ep in [50 * 6]:
-    for num_blocks, reduce_factor in [(8, 1.0)]:
+    for num_blocks, reduce_factor in [(8, 1.0), (12, 1.0)]:
         # TODO: smaller target embed dim
         args, name = get_base_v2_args(ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3})
         args["specaug_version"] = 1
@@ -961,6 +978,66 @@ def get_base_v2_args(
             bpe_size=BPE_500,
         )
 
+        # TODO: pretrain + step-based oclr
+        # note that the batch size is larger during pretraining, so the oclr here is not symmetric
+        args, name = get_base_v2_args(
+            ep, num_blocks, reduce_factor, lr_type="step-oclr", lr_opts={"lr": 1e-3, "n_step": 1450}
+        )
+        args["specaug_version"] = 1
+        args["decoder_args"].embed_dim = 256
+        run_default_exp(
+            name + "_embed256_specaug1",
+            train_args=args,
+            num_epochs=ep,
+            gpu_mem=11,
+            bpe_size=BPE_500,
+        )
+
+        # TODO: no pretrain
+        for oclr_n_step in [None, 1450]:
+            if num_blocks == 12:
+                continue
+            if oclr_n_step is None and num_blocks == 8:
+                continue
+            if oclr_n_step is not None:
+                # step-based
+                args, name = get_base_v2_args(
+                    ep, num_blocks, reduce_factor, lr_type="step-oclr", lr_opts={"lr": 1e-3, "n_step": oclr_n_step}
+                )
+            else:
+                # epoch-based
+                args, name = get_base_v2_args(
+                    ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3}
+                )
+
+            if num_blocks == 8:
+                specaug_steps = {"step0": 6_000, "step1": 8_000, "step2": 10_000}
+            elif num_blocks == 12:
+                specaug_steps = {"step0": 10_000, "step1": 15_000, "step2": 20_000}
+            else:
+                raise NotImplementedError
+
+            args["specaug_str_func_opts"] = {
+                "version": 2,
+                **specaug_steps,
+                "max_time_num": 100,
+                "max_time_dim": 20,
+                "min_num_add_factor": 0,
+                "freq_dim_factor": 5,
+            }
+            args["decoder_args"].embed_dim = 256
+            args["with_pretrain"] = False
+            run_default_exp(
+                name + "_embed256_specaugCurrV1_noPretrain",
+                train_args=args,
+                num_epochs=ep,
+                gpu_mem=11,
+                bpe_size=BPE_500,
+            )
+
+# TODO: mixup
+for ep in [50 * 6]:
+    for num_blocks, reduce_factor in [(8, 1.0)]:
         for apply_drop in [0.1, 0.2, 0.3, 0.4]:
             for max_num_mix in [4, 5]:
                 for lambda_min_max in [(0.15, 0.3), (0.1, 0.3)]:
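On the step-based SpecAugment curriculum (`specaug_str_func_opts` with `step0`/`step1`/`step2`): I read the thresholds as global-train-step boundaries at which masking is progressively enabled, standing in for the pretraining schedule when `with_pretrain` is False. A rough illustrative sketch of that idea; the actual consumer of these options lives elsewhere in the setup, and the per-stage scaling here is an assumption:

    def masking_scale(global_step, step0=6_000, step1=8_000, step2=10_000):
        # assumed curriculum: no masking before step0, then ramp the allowed
        # amount of time/freq masking up in stages (the fractions are
        # illustrative, not taken from the setup)
        if global_step < step0:
            return 0.0
        if global_step < step1:
            return 0.25
        if global_step < step2:
            return 0.5
        return 1.0

    assert masking_scale(5_000) == 0.0 and masking_scale(12_000) == 1.0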
@@ -1037,6 +1114,63 @@ def get_base_v2_args(
 # TODO: staged hyperparams
 # - weight noise: disable for first 45% of epochs for example and enable it later
 # - apply curriculum learning for utterances?
-# - grad clip: /4, /2, /1
-# - schedule sampling?
-# - label smoothing?
+
+# TODO: no ctc
+for ep in [50 * 6]:
+    for num_blocks, reduce_factor in [(8, 1.0)]:
+        # TODO: smaller target embed dim
+        args, name = get_base_v2_args(ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts={"lr": 1e-3})
+        args["specaug_version"] = 1
+        args["decoder_args"].embed_dim = 256
+        args["encoder_args"].with_ctc = False
+        run_default_exp(
+            name + "_embed256_specaug1_noCTC",
+            train_args=args,
+            num_epochs=ep,
+            gpu_mem=11,
+            bpe_size=BPE_500,
+            avg_key="dev_score",
+        )
+
+# TODO: multi-gpu training
+# single-GPU reference (hub5e00 / hub5e01 / rt03s, averaged checkpoint):
+#   conf_8l_dimF1.0_bpe500_drop0.1_selfAttDrop0.15_decDrop0.2_embedDrop0.05_wd0.0_ep300_epocOCLR-0.0001-0.001_embed256_specaug1
+#   12.4  11.1  13.0
+
+# TODO: param sync
+# results (hub5e00 / hub5e01 / rt03s, averaged checkpoint):
+#   gpu4_paramSync_step50_accum1_gradClipNorm5    13.7  12.1  14.5
+#   gpu4_paramSync_step100_accum1_gradClipNorm20  13.8  12.3  14.5
+#   gpu4_paramSync_step100_accum1_gradClipNorm5   14.0  12.1  14.6
+for ep in [50 * 6]:
+    for num_blocks, reduce_factor in [(8, 1.0)]:
+        for sync_step in [50]:
+            for gradient_clip_global_norm in [5]:
+                for lr_opts in [
+                    {"lr": 1e-3},
+                    {"lr": 1e-3, "initial_lr": 4e-4},  # higher initial LR
+                    {"lr": 1e-3, "cyc1_factor": 0.5, "cyc2_factor": 0.5},  # no fine-tuning phase
+                    {"lr": 1e-3, "cyc1_factor": 0.2, "cyc2_factor": 0.7},  # shorter warmup
+                ]:
+                    args, name = get_base_v2_args(
+                        ep, num_blocks, reduce_factor, lr_type="epoch-oclr", lr_opts=lr_opts
+                    )
+                    args["accum_grad"] = 1
+                    args["specaug_version"] = 1
+                    args["decoder_args"].embed_dim = 256
+                    args["horovod_params"] = {
+                        "horovod_reduce_type": "param",
+                        "horovod_param_sync_step": sync_step,
+                        "horovod_dataset_distribution": "random_seed_offset",
+                    }
+                    exp_name = name + f"_embed256_specaug1_gpu4_paramSync_step{sync_step}_accum1"
+                    if gradient_clip_global_norm:
+                        args["gradient_clip_global_norm"] = gradient_clip_global_norm
+                        exp_name += f"_gradClipNorm{gradient_clip_global_norm}"
+                    run_default_exp(
+                        exp_name,
+                        train_args=args,
+                        num_epochs=ep,
+                        partition_epoch=6 * 4,
+                        gpu_mem=11,
+                        bpe_size=BPE_500,
+                        horovod_num_processes=4,
+                    )
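A note on the multi-GPU bookkeeping above (my reading, not stated in the diff): with `horovod_dataset_distribution = "random_seed_offset"` each worker iterates the full corpus under its own shuffling seed, so `partition_epoch` is scaled by the number of workers (6 * 4 = 24) to keep the data consumed per sub-epoch comparable to the single-GPU runs:

    # single-GPU baseline: partition 6, 300 sub-epochs -> 50 full corpus passes;
    # 4-GPU run: partition 24, 300 sub-epochs -> 12.5 passes per worker,
    # i.e. 50 corpus passes summed over the 4 workers
    single_gpu_passes = 300 / 6
    multi_gpu_passes = (300 / (6 * 4)) * 4
    assert single_gpu_passes == multi_gpu_passes == 50.0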