Add FilterRecordingsByAlignmentConfidenceJob (#603)

Icemole · web-flow · commit 4795d0a13555 · 2025-05-26T09:02:42.000+02:00
Same as `FilterSegmentsByAlignmentConfidenceJob`, but filters out all segments of a given recording if the average recording confidence score (defined as `avg(segment.score for segment in recording)`) is greater than the threshold derived from the user-defined `percentile` and `absolute_threshold`.
diff --git a/corpus/filter.py b/corpus/filter.py
@@ -2,17 +2,19 @@
     "FilterSegmentsByListJob",
     "FilterSegmentsByRegexJob",
     "FilterSegmentsByAlignmentConfidenceJob",
+    "FilterRecordingsByAlignmentConfidenceJob",
     "FilterCorpusBySegmentsJob",
     "FilterCorpusRemoveUnknownWordSegmentsJob",
     "FilterCorpusBySegmentDurationJob",
 ]
 
+from collections import defaultdict
 import gzip
 import logging
 import numpy as np
 import re
 import xml.etree.cElementTree as ET
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 from i6_core import rasr
 from i6_core.lib import corpus
@@ -155,24 +157,54 @@ def __init__(
         if plot:
             self.out_plot_avg = self.output_path("score.png")
 
-    def tasks(self):
-        yield Task("run", resume="run", mini_task=True)
+    def _parse_alignment_logs(self, alignment_logs: Dict[int, Path]) -> Dict[str, List[Tuple[str, float]]]:
+        """
+        Read the average score of every segment from the alignment logs and group the scores by recording.

-    def run(self):
-        segment_dict = {}
-        for task_id, log_file in self.alignment_logs.items():
+        :param alignment_logs: Mapping of `task_id` into alignment log file. Only the log files are read.
+        :return: Dictionary of recording full names to list of (segment full name, alignment score).
+            As in :class:`i6_core.lib.corpus.Recording` and :class:`i6_core.lib.corpus.Segment`, the full segment
+            name is the full recording name plus the segment name, joined by a slash.
+        """
+        recording_dict: Dict[str, List[Tuple[str, float]]] = defaultdict(list)
+        for _, log_file in alignment_logs.items():
             logging.info("Reading: {}".format(log_file))
             file_path = tk.uncached_path(log_file)
             document = ET.parse(uopen(file_path))
             _seg_list = document.findall(".//segment")
             for seg in _seg_list:
                 avg = seg.find(".//score/avg")
-                segment_dict[seg.attrib["full-name"]] = float(avg.text)
+                full_seg_name = seg.attrib["full-name"]
+                full_rec_name = "/".join(full_seg_name.split("/")[:-1])  # drop the trailing segment name
+                recording_dict[full_rec_name].append((full_seg_name, float(avg.text)))
             del document
+        logging.info("Scores has {} entries.".format(len(recording_dict)))  # counts recordings, not segments
+
+        return recording_dict
+
+    def _get_alignment_scores_array(self, recording_dict: Dict[str, List[Tuple[str, float]]]) -> np.ndarray:
+        """
+        Flatten the scores of all segments of all recordings into a single array.
+
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :return: Array with the alignment confidence scores **per segment**.
+        """
+        segment_scores = [
+            alignment_score for segments in recording_dict.values() for (_, alignment_score) in segments
+        ]
+        # One entry per segment, in the order in which recordings and segments were added to the dictionary.
+        return np.asarray(segment_scores)
 
-        logging.info("Scores has {} entries.".format(len(segment_dict)))
-        score_np = np.asarray(list(segment_dict.values()))
+    def _get_avg_score_threshold(self, recording_dict: Dict[str, List[Tuple[str, float]]]) -> float:
+        """
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :return: Alignment score threshold: samples scoring at most this value should be kept,
+            samples scoring above it should be discarded.
+            It's the `percentile`-th percentile of the scores (infinity if NaN), capped by `absolute_threshold`.
+        """
+        score_np = self._get_alignment_scores_array(recording_dict)
         logging.info("Max {}; Min {}; Median {}".format(score_np.max(), score_np.min(), np.median(score_np)))
+
         avg_score_threshold = np.percentile(score_np, self.percentile)
         if np.isnan(avg_score_threshold):
             avg_score_threshold = np.inf
@@ -181,24 +213,29 @@ def run(self):
             avg_score_threshold = min(avg_score_threshold, self.absolute_threshold)
         logging.info("Threshold is {}".format(avg_score_threshold))

-        if self.plot:
-            import matplotlib
-
-            matplotlib.use("Agg")
-            import matplotlib.pyplot as plt
+        return avg_score_threshold
 
-            plot_percentile = np.percentile(score_np, 90)  # there can be huge outliers
-            np.clip(score_np, 0, 200, out=score_np)
-            plt.hist(score_np, bins=100, range=(0, 200))
-            plt.xlabel("Average Maximum-Likelihood Score")
-            plt.ylabel("Number of Segments")
-            plt.title("Histogram of Alignment Scores")
-            plt.savefig(fname=self.out_plot_avg.get_path())
-
-        # Only keep segments that are below the threshold
-        filtered_segments = [seg for seg, avg in segment_dict.items() if avg <= avg_score_threshold]
+    def _filter_segments(
+        self, recording_dict: Dict[str, List[Tuple[str, float]]], avg_score_threshold: float
+    ) -> List[str]:
+        """
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :param avg_score_threshold: Alignment score threshold: segments scoring at most this value are kept,
+            segments scoring above it are discarded.
+        :return: List of segments (represented by their full name) that should be kept.
+        """
+        # Each segment is judged by its own score; a score equal to the threshold is still kept.
+        filtered_segments = [
+            seg for seg_avg in recording_dict.values() for (seg, avg) in seg_avg if avg <= avg_score_threshold
+        ]
         logging.info("Have {} entries after filtering.".format(len(filtered_segments)))

+        return filtered_segments
+
+    def _write_output_segment_files(self, filtered_segments: List[str]):
+        """
+        :param filtered_segments: Full names of the segments to keep, written one per line to the segment files.
+        """
         for idx, segments in enumerate(chunks(filtered_segments, self.num_segments)):
             with open(self.out_single_segment_files[idx + 1].get_path(), "wt") as segment_file:
                 for segment in segments:
@@ -208,6 +245,122 @@ def run(self):
             for segment in filtered_segments:
                 segment_file.write(segment + "\n")
 
+    def _plot(self, recording_dict: Dict[str, List[Tuple[str, float]]]):
+        """
+        Plot a histogram of the alignment scores (before filtering) and save it to `out_plot_avg`.
+
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        Note: the scores are clipped to `[0, 200]`, so lower/higher values are counted in the first/last bin.
+        For more customizable plotting, it's suggested to use :class:`i6_core.mm.alignment.PlotAlignmentJob` instead.
+        """
+        import matplotlib
+
+        # Select the non-interactive backend before pyplot gets imported, so that no display is required.
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+
+        score_np = self._get_alignment_scores_array(recording_dict)
+        np.clip(score_np, 0, 200, out=score_np)
+        plt.hist(score_np, bins=100, range=(0, 200))
+        plt.xlabel("Average Maximum-Likelihood Score")
+        plt.ylabel("Number of Segments")
+        plt.title("Histogram of Alignment Scores")
+        plt.savefig(fname=self.out_plot_avg.get_path())
+
+    def tasks(self):
+        yield Task("run", resume="run", mini_task=True)  # single mini task `run`, also given as resume target
+
+    def run(self):
+        recording_dict = self._parse_alignment_logs(self.alignment_logs)
+        avg_score_threshold = self._get_avg_score_threshold(recording_dict)
+        self._write_output_segment_files(self._filter_segments(recording_dict, avg_score_threshold))
+        if self.plot:  # `out_plot_avg` only exists when plotting was requested, so don't plot otherwise.
+            self._plot(recording_dict)
+
+
+class FilterRecordingsByAlignmentConfidenceJob(FilterSegmentsByAlignmentConfidenceJob):
+    """
+    Filter segments like :class:`FilterSegmentsByAlignmentConfidenceJob` does, but decide per recording:
+    all segments of a recording are kept or discarded according to the recording's average alignment score.
+    The kept/discarded recordings and their scores are listed in `out_kept_recordings`/`out_discarded_recordings`.
+    """
+
+    def __init__(
+        self,
+        alignment_logs: Dict[int, Path],
+        percentile: float,
+        crp: Optional[rasr.CommonRasrParameters] = None,
+        plot: bool = True,
+        absolute_threshold: Optional[float] = None,
+    ):
+        """
+        :param alignment_logs: Mapping of `task_id` into log file.
+            Can be directly used as the output `out_log_file` of the job :class:`i6_core.mm.AlignmentJob`.
+        :param percentile: Percent of recordings whose segments should be kept, in the range `(0,100]`.
+            Used directly in :func:`np.percentile`.
+        :param crp: Used to set the number of output segments.
+            If `None` (default value), all segments in all alignment log files are considered.
+        :param plot: Whether to plot the distribution of alignment scores.
+        :param absolute_threshold: All segments from a recording are discarded
+            if the recording's average alignment score is above this number.
+        """
+        super().__init__(
+            alignment_logs=alignment_logs,
+            percentile=percentile,
+            crp=crp,
+            plot=plot,
+            absolute_threshold=absolute_threshold,
+        )
+
+        self.out_kept_recordings = self.output_path("kept_recordings.txt")
+        self.out_discarded_recordings = self.output_path("discarded_recordings.txt")
+
+    def _get_avg_confidence_per_recording(self, recording_dict: Dict[str, List[Tuple[str, float]]]) -> Dict[str, float]:
+        """
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :return: Dictionary of recording full names to average recording alignment score
+            (calculated as the average of all alignment scores of the segments that compose the recording).
+        """
+        return {
+            full_rec_name: np.average([conf for (_, conf) in seg_and_confs])
+            for full_rec_name, seg_and_confs in recording_dict.items()
+        }
+
+    def _get_alignment_scores_array(self, recording_dict: Dict[str, List[Tuple[str, float]]]) -> np.ndarray:
+        """
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :return: Array with the alignment confidence scores **per recording**.
+        """
+        return np.asarray(list(self._get_avg_confidence_per_recording(recording_dict).values()))
+
+    def _filter_segments(
+        self, recording_dict: Dict[str, List[Tuple[str, float]]], avg_score_threshold: float
+    ) -> List[str]:
+        """
+        :param recording_dict: Dictionary of recording full names to list of (segment full name, alignment score).
+        :param avg_score_threshold: Recordings whose average score is above this value are discarded.
+        :return: List of segments (represented by their full name) that should be kept.
+            As a side effect, the kept/discarded recordings are written to the corresponding output files.
+        """
+        recording_to_average_conf = self._get_avg_confidence_per_recording(recording_dict)
+
+        filtered_segments = []
+        # Write outputs that are local to this job here to avoid passing more variables around.
+        with uopen(self.out_kept_recordings.get_path(), "wt") as f_kept, uopen(
+            self.out_discarded_recordings.get_path(), "wt"
+        ) as f_discarded:
+            for full_rec_name, avg_alignment_score in recording_to_average_conf.items():
+                if avg_alignment_score <= avg_score_threshold:
+                    # Keep the whole recording.
+                    f_kept.write(f"{full_rec_name} {avg_alignment_score}\n")
+                    filtered_segments.extend(segment_name for segment_name, _ in recording_dict[full_rec_name])
+                else:
+                    # Discard the whole recording.
+                    f_discarded.write(f"{full_rec_name} {avg_alignment_score}\n")
+        logging.info("Have {} entries after filtering.".format(len(filtered_segments)))
+
+        return filtered_segments
+
 
 class FilterCorpusBySegmentsJob(Job):
     __sis_hash_exclude__ = {"delete_empty_recordings": False}