Add job to plot Viterbi alignment (#558)

Icemole · web-flow · commit f48ebf5eea70 · 2025-05-22T13:03:10.000+02:00
The job plots alignments from a set of alignment caches in an upward trend, as expected from a usual alignment plot.
diff --git a/mm/alignment.py b/mm/alignment.py
@@ -1,10 +1,12 @@
 __all__ = [
+    "get_segment_name_to_alignment_mapping",
     "AlignmentJob",
     "DumpAlignmentJob",
     "PlotAlignmentJob",
     "AMScoresFromAlignmentLogJob",
     "ComputeTimeStampErrorJob",
     "GetLongestAllophoneFileJob",
+    "PlotViterbiAlignmentJob",
 ]
 
 import itertools
@@ -13,20 +15,40 @@
 import os
 import shutil
 import statistics
+from typing import Callable, Counter, Dict, List, Optional, Tuple, Union
 import xml.etree.ElementTree as ET
-from typing import Callable, Counter, List, Optional, Tuple, Union
 
+import numpy as np
 from sisyphus import *
 
-Path = setup_path(__package__)
-
+import i6_core.lib.corpus as corpus
 import i6_core.lib.rasr_cache as rasr_cache
 import i6_core.rasr as rasr
 import i6_core.util as util
 
 from .flow import alignment_flow, dump_alignment_flow
 
 
+Path = setup_path(__package__)
+
+
+_SegmentNameToAlignmentType = Dict[str, List[Tuple[int, int, int, float]]]
+"""Mapping from segment names to `(timestamp, allophone_id, hmm_state, alignment_weight)`."""
+
+
+def get_segment_name_to_alignment_mapping(alignment_cache: rasr_cache.FileArchive) -> _SegmentNameToAlignmentType:
+    """
+    :param alignment_cache: Opened alignment cache from which to extract the alignments.
+    :return: Mapping from segment names to alignments (by frame).
+        The alignments are a list of tuples (timestamp, allophone_id, hmm_state, alignment_weight).
+    """
+    return {
+        segment_name: alignment_cache.read(segment_name, "align")
+        for segment_name in alignment_cache.ft.keys()
+        if not segment_name.endswith(".attribs")
+    }
+
+
 class AlignmentJob(rasr.RasrCommand, Job):
     """
     Align a dataset with the given feature scorer.
@@ -153,7 +175,6 @@ def run(self, task_id):
             )
 
     def plot(self):
-        import numpy as np
         import matplotlib
         import matplotlib.pyplot as plt
 
@@ -464,7 +485,6 @@ def tasks(self):
         yield Task("plot", resume="plot", rqmt=self.rqmt)
 
     def plot(self):
-        import numpy as np
         import matplotlib
         import matplotlib.pyplot as plt
 
@@ -825,3 +845,163 @@ def run(self):
                 line_set = {*lines} - {None}
                 assert len(line_set) == 1, f"Line {i}: expected only one allophone, but found two or more: {line_set}."
                 f.write(list(line_set)[0])
+
+
+class PlotViterbiAlignmentJob(Job):
+    """
+    Plots the alignments of each segment in the specified alignment files.
+    """
+
+    def __init__(
+        self,
+        alignment_caches: List[tk.Path],
+        allophone_file: tk.Path,
+        segment_names_to_plot: Optional[tk.Path] = None,
+        corpus_file: Optional[tk.Path] = None,
+    ):
+        """
+        :param alignment_caches: Alignment files to be plotted.
+        :param allophone_file: Allophone file used in the alignment process.
+        :param segment_names_to_plot: Specific segment names to plot.
+            By default, plot all segments given in :param:`alignment_caches`.
+        :param corpus_file: Corpus used to generate the alignments. By default, the plots have no title.
+            If provided, the plots will have the text from the respective segment as title,
+            whenever the segment is available in the corpus. This should only be given for convenience.
+        """
+        self.alignment_caches = alignment_caches
+        self.allophone_file = allophone_file
+        self.segment_names_to_plot = segment_names_to_plot
+        self.corpus_file = corpus_file
+
+        self.out_plot_dir = self.output_path("plots", directory=True)
+
+        self.rqmt = {"cpu": 1, "mem": 2.0, "time": 1.0}
+
+    def tasks(self):
+        yield Task("run", resume="run", rqmt=self.rqmt, args=range(1, len(self.alignment_caches) + 1))
+
+    def extract_phoneme_sequence(self, alignment: np.array) -> Tuple[np.array, np.array]:
+        """
+        :param alignment: Monophone alignment, for instance: `np.array(["a", "a", "b", "b", "b", "c", ...])`.
+        :return:
+            - Monophone sequence (ordered as provided in :param:`alignment`).
+
+            - **Indices** corresponding to the monophone sequence from the Viterbi alignment.
+            In the example above, these would be `[0, 0, 1, 1, 1, 2, ...]`.
+        """
+        boundaries = np.concatenate(
+            [
+                np.where(alignment[:-1] != alignment[1:])[0],
+                [len(alignment) - 1],  # manually add boundary of last allophone
+            ]
+        )
+
+        lengths = boundaries - np.concatenate([[-1], boundaries[:-1]])
+        phonemes = alignment[boundaries]
+        monotonic_idx_alignment = np.repeat(np.arange(len(phonemes)), lengths)
+        return phonemes, monotonic_idx_alignment
+
+    def make_viterbi_matrix(self, label_indices: np.array) -> np.array:
+        """
+        :param label_indices: Sequence of label (allophone) indices,
+            corresponding to the monophone sequence from the Viterbi alignment.
+
+            For example, for an alignment of `np.array(["a", "a", "b", "b", "b", "c", ...])`,
+            :param:`label_indices` would be `[0, 0, 1, 1, 1, 2, ...]`.
+        :return: Matrix corresponding to the Viterbi alignment.
+        """
+        num_timestamps = len(label_indices)
+        num_allophones = max(label_indices) + 1
+        # Place the timestamps on the Y axis because we'll map the label indices to the different phonemes there.
+        viterbi_matrix = np.zeros((num_allophones, num_timestamps), dtype=bool)
+        for i, t_i in enumerate(label_indices):
+            viterbi_matrix[t_i, i] = True
+        return viterbi_matrix
+
+    def plot(self, viterbi_matrix: np.array, allophone_sequence: List[str], file_name: str, title: str = ""):
+        """
+        :param viterbi_matrix: Matrix to be plotted, corresponding to the Viterbi alignment.
+        :param allophone_sequence: Allophone sequence (Y-axis tick labels).
+        :param file_name: File name where to store the plot, relative to `<job>/output/plots/`.
+        :param title: Optional title to add to the image. By default there will be no title.
+        :return: Plot corresponding to the monotonic alignment.
+        """
+        import matplotlib
+        import matplotlib.pyplot as plt
+
+        matplotlib.use("Agg")
+
+        num_allophones, num_timestamps = np.shape(viterbi_matrix)
+
+        fig, ax = plt.subplots(figsize=(10, 10))
+        ax.set_xlabel("Frame")
+        ax.xaxis.set_label_coords(0.98, -0.03)
+        ax.set_xbound(0, num_timestamps - 1)
+        ax.set_ybound(-0.5, num_allophones - 0.5)
+
+        ax.set_yticks(np.arange(num_allophones))
+        ax.set_yticklabels(allophone_sequence)
+
+        ax.set_title(title)
+
+        ax.imshow(viterbi_matrix, cmap="Blues", interpolation="none", aspect="auto", origin="lower")
+
+        # The plot will be purposefully divided into subdirectories.
+        os.makedirs(os.path.dirname(os.path.join(self.out_plot_dir.get_path(), file_name)), exist_ok=True)
+        fig.savefig(os.path.join(self.out_plot_dir.get_path(), f"{file_name}.png"))
+        matplotlib.pyplot.close(fig)
+
+    def run(self, task_id: int):
+        if self.segment_names_to_plot is not None:
+            # Load the segment names to plot.
+            with util.uopen(self.segment_names_to_plot.get_path(), "rt") as f:
+                segment_names_to_plot = {seg_name.strip() for seg_name in f}
+            # Load the segment names from the alignment caches.
+            align_cache = rasr_cache.FileArchive(self.alignment_caches[task_id - 1].get_path())
+            align_cache.setAllophones(self.allophone_file.get_path())
+            seg_name_to_alignments = {
+                seg_name: alignments
+                for seg_name, alignments in get_segment_name_to_alignment_mapping(align_cache).items()
+                # Only load the specific segment names that the user has provided.
+                if seg_name in segment_names_to_plot
+            }
+        else:
+            # Load the segment names from the alignment caches.
+            align_cache = rasr_cache.FileArchive(self.alignment_caches[task_id - 1].get_path())
+            align_cache.setAllophones(self.allophone_file.get_path())
+            seg_name_to_alignments = get_segment_name_to_alignment_mapping(align_cache)
+            # Plot everything from the local alignment cache.
+            segment_names_to_plot = seg_name_to_alignments.keys()
+
+        seg_name_to_text = {}
+        if self.corpus_file is not None:
+            c = corpus.Corpus()
+            c.load(self.corpus_file.get_path())
+            seg_name_to_text = {seg_name: segment.full_orth() for seg_name, segment in c.get_segment_mapping().items()}
+
+        empty_alignment_seg_names = []
+        for seg_name in segment_names_to_plot:
+            alignments = seg_name_to_alignments.get(seg_name, None)
+            if alignments is None:
+                continue
+            # In some rare cases, the alignment doesn't have to reach a satisfactory end.
+            # In these cases, the final alignment is empty. Skip those cases.
+            if len(alignments) == 0:
+                empty_alignment_seg_names.append(seg_name)
+                continue
+
+            for i, (_, allo_id, _, _) in enumerate(alignments):
+                allophone = align_cache.allophones[allo_id]
+                # Get the central part of the allophone.
+                seg_name_to_alignments[seg_name][i] = allophone.split("{")[0]
+
+            center_allophones = np.array(seg_name_to_alignments[seg_name])
+            phonemes, alignment_indices = self.extract_phoneme_sequence(center_allophones)
+            viterbi_matrix = self.make_viterbi_matrix(alignment_indices)
+            self.plot(viterbi_matrix, phonemes, file_name=seg_name, title=seg_name_to_text.get(seg_name, ""))
+
+        if empty_alignment_seg_names:
+            logging.warning(
+                "The following alignments weren't plotted because their alignments were empty:\n"
+                f"{empty_alignment_seg_names}"
+            )