Filter*ByAlignmentConfidenceJob: add flag to disregard alignments that didn't reach a final state (#608)

Icemole · web-flow · commit a5a4600633b7 · 2025-06-18T12:17:40.000+02:00
* Add job

* Fix job

Add relevant imports, remove irrelevant code

* Remove job

* Add `remove_dnf_alignments` flag to parse alignment logs function

* Set flag to true in recording filtering

* Simplify code as per offline review

* Add DNF option to base segment filtering job

Will be much more useful in the long run
diff --git a/corpus/filter.py b/corpus/filter.py
@@ -128,26 +128,37 @@ def run(self):
 
 
 class FilterSegmentsByAlignmentConfidenceJob(Job):
+    __sis_hash_exclude__ = {"remove_dnf_alignments": False}
+
     def __init__(
         self,
         alignment_logs: Dict[int, Path],
         percentile: float,
         crp: Optional[rasr.CommonRasrParameters] = None,
         plot: bool = True,
         absolute_threshold: Optional[float] = None,
+        remove_dnf_alignments: bool = False,
     ):
         """
         :param alignment_logs: alignment_job.out_log_file; task_id -> log_file
         :param percentile: percent of alignment segments to keep. should be in (0,100]. for :func:`np.percentile`
         :param crp: used to set the number of output segments. if none, number of alignment log files is used instead.
         :param plot: plot the distribution of alignment scores
         :param absolute_threshold: alignments with score above this number are discarded
+        :param remove_dnf_alignments: Whether alignments that haven't reached a final state
+            should be excluded from the final statistics dictionary.
+
+            Note that these alignments haven't made it to the final alignment caches,
+            so parsing them is inconsistent with respect to the final caches
+            and pollutes any statistics retrieved from the data.
+            The default value is `False` only for backward compatibility, and `True` is recommended instead.
         """
         self.alignment_logs = alignment_logs  # alignment_job.log_file
         self.percentile = percentile
         self.absolute_threshold = absolute_threshold
         self.num_segments = len(alignment_logs) if crp is None else crp.concurrent
         self.plot = plot
+        self.remove_dnf_alignments = remove_dnf_alignments
 
         self.out_single_segment_files = dict(
             (i, self.output_path("segments.%d" % i)) for i in range(1, self.num_segments + 1)
@@ -157,8 +168,18 @@ def __init__(
         if plot:
             self.out_plot_avg = self.output_path("score.png")
 
-    def _parse_alignment_logs(self, alignment_logs: Dict[int, Path]) -> Dict[str, List[Tuple[str, float]]]:
+    def _parse_alignment_logs(
+        self, alignment_logs: Dict[int, Path], remove_dnf_alignments: bool = False
+    ) -> Dict[str, List[Tuple[str, float]]]:
         """
+        :param alignment_logs: Alignment logs to analyze.
+        :param remove_dnf_alignments: Whether alignments that haven't reached a final state
+            should be excluded from the final statistics dictionary.
+
+            Note that these alignments haven't made it to the final alignment caches,
+            so parsing them is inconsistent with respect to the final caches
+            and pollutes any statistics retrieved from the data.
+            The default value is `False` only for backward compatibility, and `True` is recommended instead.
         :return: Dictionary of recording full names to list of (segment full name, alignment score).
 
             Note: the names adhere to the standards of the :class:`i6_core.lib.corpus.Recording`
@@ -173,6 +194,10 @@ def _parse_alignment_logs(self, alignment_logs: Dict[int, Path]) -> Dict[str, Li
             document = ET.parse(uopen(file_path))
             _seg_list = document.findall(".//segment")
             for seg in _seg_list:
+                if remove_dnf_alignments and any(
+                    "Alignment did not reach any final state." in warning.text for warning in seg.findall(".//warning")
+                ):
+                    continue
                 avg = seg.find(".//score/avg")
                 full_seg_name = seg.attrib["full-name"]
                 full_rec_name = "/".join(full_seg_name.split("/")[:-1])
@@ -271,7 +296,9 @@ def tasks(self):
         yield Task("run", resume="run", mini_task=True)
 
     def run(self):
-        recording_dict = self._parse_alignment_logs(self.alignment_logs)
+        recording_dict = self._parse_alignment_logs(
+            self.alignment_logs, remove_dnf_alignments=self.remove_dnf_alignments
+        )
         avg_score_threshold = self._get_avg_score_threshold(recording_dict)
         filtered_segments = self._filter_segments(recording_dict, avg_score_threshold)
         self._write_output_segment_files(filtered_segments)
@@ -361,6 +388,14 @@ def _filter_segments(
 
         return filtered_segments
 
+    def run(self):
+        # Alignments that haven't reached a final state can bias the mean computation, so they're removed.
+        recording_dict = self._parse_alignment_logs(self.alignment_logs, remove_dnf_alignments=True)
+        avg_score_threshold = self._get_avg_score_threshold(recording_dict)
+        filtered_segments = self._filter_segments(recording_dict, avg_score_threshold)
+        self._write_output_segment_files(filtered_segments)
+        self._plot(recording_dict)
+
 
 class FilterCorpusBySegmentsJob(Job):
     __sis_hash_exclude__ = {"delete_empty_recordings": False}