@@ -204,64 +204,68 @@ def run_exp(ft_name, datasets, train_args, search_args=None, with_prior=False, n


 # from here on onwards, use default AdamW with same OCLR
-train_args_adamw_02 = {
-    "config": {
-        "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2},
-        "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)),
-        #############
-        "batch_size": 200 * 16000,
-        "max_seq_length": {"audio_features": 35 * 16000},
-        "accum_grad_multiple_step": 2,
-    },
-}
-
-model_config_smaller = ModelConfig(
-    feature_extraction_config=fe_config,
-    frontend_config=frontend_config,
-    specaug_config=specaug_config,
-    label_target_size=vocab_size_without_blank,
-    conformer_size=384,
-    num_layers=12,
-    num_heads=4,
-    ff_dim=384,
-    att_weights_dropout=0.2,
-    conv_dropout=0.2,
-    ff_dropout=0.2,
-    mhsa_dropout=0.2,
-    conv_kernel_size=9,
-    final_dropout=0.2,
-    specauc_start_epoch=1,
-)
+# train_args_adamw_02 = {
+#     "config": {
+#         "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2},
+#         "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)),
+#         #############
+#         "batch_size": 200 * 16000,
+#         "max_seq_length": {"audio_features": 35 * 16000},
+#         "accum_grad_multiple_step": 2,
+#     },
+# }
-
-train_args = {
-    **copy.deepcopy(train_args_adamw_02),
-    "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6",
-    "net_args": {"model_config_dict": asdict(model_config_smaller)},
-}
-for lm_weight in [1.6, 1.8, 2.0, 2.2]:
-    for prior_scale in [0.3, 0.5]:
-        search_args = {
-            **default_search_args,
-            "lm_weight": lm_weight,
-            "prior_scale": prior_scale,
-        }
-        run_exp(
-            prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" % (
-                lm_weight, prior_scale),
-            datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True)
+# model_config_smaller = ModelConfig(
+#     feature_extraction_config=fe_config,
+#     frontend_config=frontend_config,
+#     specaug_config=specaug_config,
+#     label_target_size=vocab_size_without_blank,
+#     conformer_size=384,
+#     num_layers=12,
+#     num_heads=4,
+#     ff_dim=384,
+#     att_weights_dropout=0.2,
+#     conv_dropout=0.2,
+#     ff_dropout=0.2,
+#     mhsa_dropout=0.2,
+#     conv_kernel_size=9,
+#     final_dropout=0.2,
+#     specauc_start_epoch=1,
+# )
+#
+# train_args = {
+#     **copy.deepcopy(train_args_adamw_02),
+#     "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6",
+#     "net_args": {"model_config_dict": asdict(model_config_smaller)},
+# }
-model_config_smaller_start11 = copy.deepcopy(model_config_smaller)
-model_config_smaller_start11.specauc_start_epoch = 11
-train_args_start11 = copy.deepcopy(train_args)
-train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11)
-for lm_weight in [1.6, 1.8, 2.0, 2.2]:
-    for prior_scale in [0.3, 0.5]:
-        search_args = {
-            **default_search_args,
-            "lm_weight": lm_weight,
-            "prior_scale": prior_scale,
-        }
-        run_exp(
-            prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" % (
-                lm_weight, prior_scale),
-            datasets=train_data, train_args=train_args_start11, search_args=search_args, with_prior=True)
+# Diverged
+
+# for lm_weight in [1.6, 1.8, 2.0, 2.2]:
+#     for prior_scale in [0.3, 0.5]:
+#         search_args = {
+#             **default_search_args,
+#             "lm_weight": lm_weight,
+#             "prior_scale": prior_scale,
+#         }
+#         run_exp(
+#             prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" % (
+#                 lm_weight, prior_scale),
+#             datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True)
+
+# This one was worse than the baseline 16.5 -> 17.9
+# model_config_smaller_start11 = copy.deepcopy(model_config_smaller)
+# model_config_smaller_start11.specauc_start_epoch = 11
+# train_args_start11 = copy.deepcopy(train_args)
+# train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11)
+# for lm_weight in [1.6, 1.8, 2.0, 2.2]:
+#     for prior_scale in [0.3, 0.5]:
+#         search_args = {
+#             **default_search_args,
+#             "lm_weight": lm_weight,
+#             "prior_scale": prior_scale,
+#         }
+#         run_exp(
+#             prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" % (
+#                 lm_weight, prior_scale),
+#             datasets=train_data, train_args=train_args_start11, search_args=search_args, with_prior=True)