
Commit b2d2baf

Authored by Marvin84, Judyxujj, Jingjing Xu, mmz33, and Simon Berger
Dummy PR (#226)
* Add swb PyTorch ctc setup (#219) * add initial setup * rm binary files * rm binary files --------- Co-authored-by: Jingjing Xu <[email protected]>
* add conformer enc with more weight dropout
* fix
* add more weight noise opts
* black formatting
* Jing tedlium independent softmax (#220) * tedlium ctc pytorch * rm empty files --------- Co-authored-by: Jingjing Xu <[email protected]>
* use ff regs for mhsa out
* add more regularized trafo dec
* update
* add regs to rnn decoder
* more
* add more regs to rnn dec
* black formatting
* Update users/berger
* update
* add readme for RF
* more
* cleanup, generalize, different spm vocab sizes
* more
* more
* small fix
* update ls att+ctc+lm
* add args
* fix pretraining
* Glow-TTS-ASR: Update with fixed invertibility tests
* Glow-TTS-ASR update
* fix
* Glow-TTS-ASR: Cleanup and comments/documentation
* small fixes
* spm20k
* more ctc and rnn-t librispeech experiments
* add greedy decoder
* black
* add ebranchformer
* more
* more
* better layer names for ebranchformer
* better
* better
* decouple mhsa residual
* cleanup
* refactor args. add ebranch config
* more
* Update users/berger
* Update users/berger
* better
* config enable write cache manager
* standalone 2024 setup add LSTM lm pipeline
* update
* add horovod to libri pipeline
* update configs
* fix
* more
* update
* update zoneout fix ted2
* Update users/berger
* Update users/berger
* more
* update
* update
* Update users/berger
* more
* more
* update
* update
* ConformerV2 setup
* updates
* cleanup
* updates and fix mel norm + zoneout
* update conf v2
* more
* update
* Update users/berger
* fixes and update
* add CTC gauss weights
* convert ls960 LSTM LM to rf
* fix
* fix
* fix
* more
* more
* use_eos_postfix
* fix CTC with EOS recog scoring
* ctc eos fix more
* update
* Update Glow-TTS-ASR
* updates quant
* added factored bw
* deleted wrong stashed
* update trainings and initial rnnt decoder rf
* feature batch norm
* feature normalization
* recog fix API doc
* collect stats, initial code
* librispeech feature stats
* feature global norm
* small fixes
* small fix
* small fix
* small fix
* fix feat norm search
* update
* update
* update
* add more weight drop to rnn decoder
* add chunked rnn decoder
* update
* fix
* fix
* more
* more
* fix name
* more
* more
* small fix
* more
* more
* cleanup
* fix
* comment
* more
* cleanup
* more
* add canary 1b recog sis prepare config
* add config (#223) Co-authored-by: Jingjing Xu <[email protected]>
* more
* add nemo model download job
* add nemo search job
* add custom hash
* fix
* add nemo search
* first version of nemo search
* better
* fix bug
* better
* add missing search output path
* add compute_wer func
* add wer as output var
* run search for all test sets with canary 1b model
* add configs (#224) Co-authored-by: Jingjing Xu <[email protected]>
* update
* update
* register wer as out
* update
* add libri test other test set
* fix args
* fix args
* update
* add modified normalized
* Create README.md
* Update README.md
* Update users/berger
* Update users/berger
* more
* more
* more
* more
* more
* more
* prepare for some more modeling code
* move SequentialLayerDrop
* better
* move mixup
* rnnt dec rf WIP
* Update users/berger
* update users/raissi monofactored
* update
* update
* ls960 pretrain: use phoneme info for mask boundaries
* BatchRenorm initial implementation (untested)
* test_piecewise_linear
* test_piecewise_linear use dyn_lr_piecewise_linear
* dyn_lr_piecewise_linear use RETURNN PiecewiseLinear
* DeleteLemmataFromLexiconJob (#225)
* ls960 pretrain: phoneme mask and other targets
* ls960 pretrain: update num epochs
* better
* first version of beam search
* fix
* fix enc shape
* use expand instead of repeat for efficiency
* better
* add hyp postprocessing
* better
* add beam search
* remove print
* more
* BatchRenorm with build_from_dict
* more
* small fix
* more
* small fix
* reorder code
* comment
* prior
* cleanup
* cache enc beam expansion
* fix bug
* update
* more
* more
* more
* more
* LS spm vocab alias
* make private
* move
* lazy, aliases
* update and test rf vs torch mhsa
* fix warning
* fix bug
* vocab outputs
* more
* more, AED featBN, sampling
* extract SPM vocab
* add rtfs
* add cache suffix
* update
* fix
* add debug out
* add batch size logging
* import i6_models conformer in rf, batch 1
* SamplingBytePairEncoding for SentencePiece
* add gradient clipping to example baseline
* 2-precision WER and quantization helper
* HDF alignment labels example data pipeline
* ls960 pretrain: fix python launcher for itc/i6
* latest users/raissi

---------

Co-authored-by: Judyxujj <[email protected]>
Co-authored-by: Jingjing Xu <[email protected]>
Co-authored-by: Mohammad Zeineldeen <[email protected]>
Co-authored-by: Simon Berger <[email protected]>
Co-authored-by: schmitt <[email protected]>
Co-authored-by: Albert Zeyer <[email protected]>
Co-authored-by: luca.gaudino <[email protected]>
Co-authored-by: Lukas Rilling <[email protected]>
Co-authored-by: Nick Rossenbach <[email protected]>
Co-authored-by: Benedikt Hilmes <[email protected]>
Co-authored-by: Mohammad Zeineldeen <[email protected]>
Co-authored-by: Peter Vieting <[email protected]>
Co-authored-by: vieting <[email protected]>
1 parent cf5884a commit b2d2baf

528 files changed, +54982 −17037 lines changed


example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py

Lines changed: 23 additions & 0 deletions
@@ -61,6 +61,7 @@ def bpe_ls960_1023_base():
     }
 
     from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig
+    from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig
 
     def tune_and_evaluate_helper(
         training_name: str,
@@ -121,6 +122,22 @@ def tune_and_evaluate_helper(
             **default_returnn,
         )
 
+    def greedy_search_helper(training_name: str, asr_model: ASRModel, decoder_config: GreedyDecoderConfig):
+        # remove prior if exists
+        asr_model = copy.deepcopy(asr_model)
+        asr_model.prior_file = None
+
+        search_name = training_name + "/search_greedy"
+        search_jobs, wers = search(
+            search_name,
+            forward_config={},
+            asr_model=asr_model,
+            decoder_module="ctc.decoder.greedy_bpe_ctc_v3",
+            decoder_args={"config": asdict(decoder_config)},
+            test_dataset_tuples=dev_dataset_tuples,
+            **default_returnn,
+        )
+
     default_decoder_config_bpe5000 = DecoderConfig(
         lexicon=get_text_lexicon(prefix=prefix_name, librispeech_key="train-other-960", bpe_size=5000),
         returnn_vocab=label_datastream_bpe5000.vocab,
@@ -200,6 +217,7 @@ def tune_and_evaluate_helper(
         "max_seq_length": {"audio_features": 35 * 16000},
         "accum_grad_multiple_step": 1,
         "torch_amp_options": {"dtype": "bfloat16"},
+        "gradient_clip": 1.0,
     }
 
     network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"
@@ -224,3 +242,8 @@ def tune_and_evaluate_helper(
         lm_scales=[1.6, 1.8, 2.0],
         prior_scales=[0.2, 0.3, 0.4],
     )
+
+    greedy_decoder_config = GreedyDecoderConfig(
+        returnn_vocab=label_datastream_bpe5000.vocab,
+    )
+    greedy_search_helper(training_name=training_name, asr_model=asr_model, decoder_config=greedy_decoder_config)

example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py

Lines changed: 1 addition & 0 deletions
@@ -195,6 +195,7 @@ def tune_and_evaluate_helper(
         "max_seq_length": {"audio_features": 35 * 16000},
         "accum_grad_multiple_step": 1,
         "torch_amp_options": {"dtype": "bfloat16"},
+        "gradient_clip": 1.0,
     }
 
     network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""
+Greedy CTC decoder without any extras
+
+v3: add config objects
+"""
+from dataclasses import dataclass
+import time
+import torch
+
+
+@dataclass
+class DecoderConfig:
+    returnn_vocab: str
+
+
+@dataclass
+class ExtraConfig:
+    # used for RTF logging
+    print_rtf: bool = True
+    sample_rate: int = 16000
+
+    # Hypothesis logging
+    print_hypothesis: bool = True
+
+
+def forward_init_hook(run_ctx, **kwargs):
+    # we are storing durations, but call it output.hdf to match
+    # the default output of the ReturnnForwardJob
+    config = DecoderConfig(**kwargs["config"])
+    extra_config_dict = kwargs.get("extra_config", {})
+    extra_config = ExtraConfig(**extra_config_dict)
+
+    run_ctx.recognition_file = open("search_out.py", "wt")
+    run_ctx.recognition_file.write("{\n")
+
+    from returnn.datasets.util.vocabulary import Vocabulary
+
+    vocab = Vocabulary.create_vocab(vocab_file=config.returnn_vocab, unknown_label=None)
+    run_ctx.labels = vocab.labels
+
+    run_ctx.print_rtf = extra_config.print_rtf
+    if run_ctx.print_rtf:
+        run_ctx.running_audio_len_s = 0
+        run_ctx.total_time = 0
+
+    run_ctx.print_hypothesis = extra_config.print_hypothesis
+
+
+def forward_finish_hook(run_ctx, **kwargs):
+    run_ctx.recognition_file.write("}\n")
+    run_ctx.recognition_file.close()
+
+    print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s))
+
+
+def forward_step(*, model, data, run_ctx, **kwargs):
+    raw_audio = data["raw_audio"]  # [B, T', F]
+    raw_audio_len = data["raw_audio:size1"]  # [B]
+
+    audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000
+
+    if run_ctx.print_rtf:
+        run_ctx.running_audio_len_s += audio_len_batch
+        am_start = time.time()
+
+    logprobs, audio_features_len = model(
+        raw_audio=raw_audio,
+        raw_audio_len=raw_audio_len,
+    )
+    batch_indices = []
+    for lp, l in zip(logprobs, audio_features_len):
+        batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy())
+
+    if run_ctx.print_rtf:
+        am_time = time.time() - am_start
+        run_ctx.total_time += am_time
+        print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch))
+
+    tags = data["seq_tag"]
+
+    for indices, tag in zip(batch_indices, tags):
+        sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)]
+        sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))]
+        text = " ".join(sequence).replace("@@ ", "")
+        if run_ctx.print_hypothesis:
+            print(text)
+        run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text)))

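Note: as an illustration of the greedy decoding performed in forward_step above (per-frame argmax, merging of repeated frames with torch.unique_consecutive, dropping blank/special labels, undoing BPE merges), here is a minimal self-contained sketch. The toy vocabulary and frame decisions are made up for this example and are not part of the setup.

# Toy illustration; hypothetical vocabulary and frame sequence.
import torch

labels = ["<blank>", "HE@@", "LLO", "WORLD"]     # made-up BPE-style vocabulary, index 0 = CTC blank
frame_argmax = torch.tensor([0, 1, 1, 2, 0, 3])  # pretend per-frame argmax over the logprobs, shape [T]

indices = torch.unique_consecutive(frame_argmax)  # collapse repeated frames: [0, 1, 2, 0, 3]
sequence = [labels[i] for i in indices if not labels[i].startswith("<")]  # drop blank/special symbols
text = " ".join(sequence).replace("@@ ", "")      # undo BPE "@@ " joining, as in forward_step
print(text)                                       # HELLO WORLD
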
users/berger/args/experiments/ctc.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def get_ctc_recog_step_args(num_classes: int, reduction_factor: int = 4, **kwarg
             "mem_rqmt": 16,
         },
         "rtf": 20,
-        "mem": 4,
+        "mem": 8,
     }
 
     return recursive_update(default_args, kwargs)

users/berger/args/experiments/transducer.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def get_transducer_recog_step_args(
             "mem_rqmt": 16,
         },
         "rtf": 50,
-        "mem": 4,
+        "mem": 8,
     }
 
     return recursive_update(default_args, kwargs)

users/berger/args/jobs/rasr_init_args.py

Lines changed: 25 additions & 0 deletions
@@ -91,6 +91,7 @@ def get_feature_extraction_args_16kHz(
     gt_args: Optional[Dict] = None,
 ) -> Dict:
     mfcc_filter_width = features.filter_width_from_channels(channels=20, f_max=8000)  # = 16000 / 2
+    filterbank_filter_width = features.filter_width_from_channels(channels=80, f_max=8000)  # = 16000 / 2
 
     if mfcc_cepstrum_options is None:
         mfcc_cepstrum_options = {
@@ -142,6 +143,30 @@ def get_feature_extraction_args_16kHz(
                 "normalization_options": {},
             }
         },
+        "filterbank": {
+            "filterbank_options": {
+                "warping_function": "mel",
+                "filter_width": filterbank_filter_width,
+                "normalize": False,
+                "normalization_options": {},
+                "without_samples": False,
+                "samples_options": {
+                    "audio_format": "wav",
+                    # "scale_input": 2**-15,
+                    "dc_detection": dc_detection,
+                },
+                "fft_options": {
+                    "preemphasis": 0.97,
+                    "window_type": "hanning",
+                    "window_shift": 0.01,
+                    "window_length": 0.025,
+                },
+                "apply_log": True,
+                "add_epsilon": True,
+                "add_features_output": True,
+                # "warp_differential_unit": False,
+            },
+        },
         "energy": {
             "energy_options": {
                 "without_samples": False,

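Note: a quick back-of-the-envelope for the 16 kHz filterbank options added above (window parameters are given in seconds). The snippet below only restates those numbers for orientation and is not part of the setup.

# Hypothetical sanity check; the numbers restate the fft_options above.
sample_rate = 16000                            # 16 kHz audio
window_shift_s, window_length_s = 0.01, 0.025  # window_shift / window_length from fft_options
print(int(window_shift_s * sample_rate))       # 160 samples shift -> 100 frames per second
print(int(window_length_s * sample_rate))      # 400 samples per analysis window
# 80 mel channels up to f_max = 8000 Hz (= 16000 / 2), with log applied -> 80-dim log-mel features
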
users/berger/args/returnn/config.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ def get_base_config(backend: Backend) -> Dict[str, Any]:
     elif backend == Backend.PYTORCH:
         result["backend"] = "torch"
         result["use_lovely_tensors"] = True
+        # result["torch_amp"] = {"dtype": "bfloat16"}
     else:
         raise NotImplementedError
     return result

users/berger/args/returnn/learning_rates.py

Lines changed: 55 additions & 0 deletions
@@ -10,6 +10,7 @@ class LearningRateSchedules(Enum):
     NewbobAbs = auto()
     OCLR = auto()
     OCLR_STEP = auto()
+    OCLR_STEP_TORCH = auto()
     CONST_DECAY = auto()
     CONST_DECAY_STEP = auto()
 
@@ -38,6 +39,8 @@ def get_learning_rate_config(
         config.update(get_oclr_config(**kwargs))
     elif schedule == LearningRateSchedules.OCLR_STEP:
         extra_python.append(get_oclr_function(**kwargs))
+    elif schedule == LearningRateSchedules.OCLR_STEP_TORCH:
+        extra_python.append(get_oclr_function_torch(**kwargs))
     elif schedule == LearningRateSchedules.CONST_DECAY:
         config.update(get_const_decay_config(**kwargs))
     elif schedule == LearningRateSchedules.CONST_DECAY_STEP:
@@ -184,6 +187,58 @@ def get_oclr_function(
     )
 
 
+def get_oclr_function_torch(
+    num_epochs: int,
+    n_steps_per_epoch: int,
+    peak_lr: float = 1e-03,
+    inc_epochs: Optional[int] = None,
+    dec_epochs: Optional[int] = None,
+    initial_lr: Optional[float] = None,
+    decayed_lr: Optional[float] = None,
+    final_lr: Optional[float] = None,
+    **kwargs,
+) -> str:
+    initial_lr = initial_lr or peak_lr / 10
+    decayed_lr = decayed_lr or initial_lr
+    final_lr = final_lr or initial_lr / 5
+    inc_epochs = inc_epochs or (num_epochs * 9) // 20
+    dec_epochs = dec_epochs or inc_epochs
+
+    return dedent(
+        f"""def dynamic_learning_rate(*, global_train_step: int, **_):
+            # Increase linearly from initial_lr to peak_lr over the first inc_epoch epochs
+            # Decrease linearly from peak_lr to decayed_lr over the next dec_epoch epochs
+            # Decrease linearly from decayed_lr to final_lr over the remaining epochs
+            initial_lr = {initial_lr}
+            peak_lr = {peak_lr}
+            decayed_lr = {decayed_lr}
+            final_lr = {final_lr}
+            inc_epochs = {inc_epochs}
+            dec_epochs = {dec_epochs}
+            total_epochs = {num_epochs}
+            n_steps_per_epoch = {n_steps_per_epoch}
+
+            # -- derived -- #
+            steps_increase = inc_epochs * n_steps_per_epoch
+            steps_decay = dec_epochs * n_steps_per_epoch
+            steps_final = (total_epochs - inc_epochs - dec_epochs) * n_steps_per_epoch
+
+            step_size_increase = (peak_lr - initial_lr) / steps_increase
+            step_size_decay = (peak_lr - decayed_lr) / steps_decay
+            step_size_final = (decayed_lr - final_lr) / steps_final
+
+            if global_train_step <= steps_increase:
+                return initial_lr + step_size_increase * global_train_step
+            if global_train_step <= steps_increase + steps_decay:
+                return peak_lr - step_size_decay * (global_train_step - steps_increase)
+
+            return max(
+                decayed_lr - step_size_final * (global_train_step - steps_increase - steps_decay),
+                final_lr
+            )"""
+    )
+
+
 def get_const_decay_config(
     num_epochs: int,
     const_lr: float = 1e-03,

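Note: a small sanity-check sketch of the piecewise-linear schedule that get_oclr_function_torch generates. The concrete values (20 epochs, 100 steps per epoch, peak_lr = 1e-3) are assumptions for illustration only; the defaults are derived from the peak LR in the same way as in the function above.

# Sketch with assumed numbers; mirrors the generated dynamic_learning_rate code above.
def dynamic_learning_rate(*, global_train_step: int, **_):
    initial_lr, peak_lr = 1e-4, 1e-3    # initial_lr = peak_lr / 10
    decayed_lr, final_lr = 1e-4, 2e-5   # decayed_lr = initial_lr, final_lr = initial_lr / 5
    inc_epochs = dec_epochs = 9         # (20 * 9) // 20
    total_epochs, n_steps_per_epoch = 20, 100

    steps_increase = inc_epochs * n_steps_per_epoch  # 900 warm-up steps
    steps_decay = dec_epochs * n_steps_per_epoch     # 900 decay steps
    steps_final = (total_epochs - inc_epochs - dec_epochs) * n_steps_per_epoch  # 200 final steps

    step_size_increase = (peak_lr - initial_lr) / steps_increase
    step_size_decay = (peak_lr - decayed_lr) / steps_decay
    step_size_final = (decayed_lr - final_lr) / steps_final

    if global_train_step <= steps_increase:
        return initial_lr + step_size_increase * global_train_step
    if global_train_step <= steps_increase + steps_decay:
        return peak_lr - step_size_decay * (global_train_step - steps_increase)
    return max(decayed_lr - step_size_final * (global_train_step - steps_increase - steps_decay), final_lr)

for step in (0, 900, 1800, 1999):
    print(step, dynamic_learning_rate(global_train_step=step))
# 0 -> 1e-4 (initial), 900 -> 1e-3 (peak), 1800 -> 1e-4 (decayed), 1999 -> ~2e-5 (near final)
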
users/berger/args/returnn/regularization.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ def get_chunking_config(
 
     if isinstance(chunking_factors, list):
         chunking_factors = {key: 1 for key in chunking_factors}
-    assert isinstance(chunking_factors, Dict)
+    assert isinstance(chunking_factors, dict)
     return {
         "chunking": (
             {key: base_chunk_size // factor for key, factor in chunking_factors.items()},

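Note: a tiny illustration of the chunking_factors handling shown in the diff above; the example factors and base_chunk_size are made up for this sketch.

# Hypothetical values, only to illustrate the dict handling above.
base_chunk_size = 400
chunking_factors = ["data", "classes"]                    # list form ...
chunking_factors = {key: 1 for key in chunking_factors}   # ... becomes {"data": 1, "classes": 1}
assert isinstance(chunking_factors, dict)                 # built-in dict as the isinstance target
print({key: base_chunk_size // factor for key, factor in chunking_factors.items()})
# {'data': 400, 'classes': 400}
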
users/berger/configs/librispeech/20230210_baselines/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 from .config_02c_transducer_rasr_features_wei_lex import py as py_02c
 from .config_02e_transducer_rasr_features_tinaconf import py as py_02e
 from .config_02e_transducer_rasr_features_tinaconf_rtf import py as py_02e_rtf
+from .config_02f_transducer_rasr_features_am_scales import py as py_02f
 from .config_03a_transducer_fullsum_raw_samples import py as py_03a
 from .config_03b_transducer_fullsum_rasr_features import py as py_03b
 from .config_03c_transducer_fullsum_rasr_features_wei_lex import py as py_03c
@@ -37,6 +38,7 @@ def main() -> SummaryReport:
     sub_reports.append(copy.deepcopy(py_02c()[0]))
     sub_reports.append(copy.deepcopy(py_02e()))
     sub_reports.append(copy.deepcopy(py_02e_rtf()))
+    sub_reports.append(copy.deepcopy(py_02f()))
     sub_reports.append(copy.deepcopy(py_03a()))
     sub_reports.append(copy.deepcopy(py_03b()))
     sub_reports.append(copy.deepcopy(py_03c()))
