Merge pull request matplotlib#30608 from QuLogic/simpler-track

QuLogic · web-flow · commit ed4ca6cfb00a · 2025-10-02T19:00:16.000-04:00
Prepare `CharacterTracker` for advanced font features
diff --git a/lib/matplotlib/backends/_backend_pdf_ps.py b/lib/matplotlib/backends/_backend_pdf_ps.py
@@ -22,6 +22,12 @@
     from fontTools.ttLib import TTFont
 
 
+_FONT_MAX_GLYPH = {
+    3: 256,
+    42: 65536,
+}
+
+
 @functools.lru_cache(50)
 def _cached_get_afm_from_fname(fname):
     with open(fname, "rb") as fh:
@@ -103,6 +109,57 @@ def font_as_file(font):
     return fh
 
 
+class GlyphMap:
+    """
+    A two-way glyph mapping.
+
+    The forward glyph map is from (character string, glyph index)-pairs to
+    (subset index, subset character code)-pairs.
+
+    The inverse glyph map is from (subset index, subset character code)-pairs to
+    (character string, glyph index)-pairs.
+    """
+
+    def __init__(self) -> None:
+        self._forward: dict[tuple[CharacterCodeType, GlyphIndexType],
+                            tuple[int, CharacterCodeType]] = {}
+        self._inverse: dict[tuple[int, CharacterCodeType],
+                            tuple[CharacterCodeType, GlyphIndexType]] = {}
+
+    def get(self, charcodes: str,
+            glyph_index: GlyphIndexType) -> tuple[int, CharacterCodeType] | None:
+        """
+        Get the forward mapping from a (character string, glyph index)-pair.
+
+        This may return *None* if the pair is not currently mapped.
+        """
+        return self._forward.get((charcodes, glyph_index))
+
+    def iget(self, subset: int,
+             subset_charcode: CharacterCodeType) -> tuple[str, GlyphIndexType]:
+        """Get the inverse mapping from a (subset, subset charcode)-pair."""
+        return self._inverse[(subset, subset_charcode)]
+
+    def add(self, charcode: str, glyph_index: GlyphIndexType, subset: int,
+            subset_charcode: CharacterCodeType) -> None:
+        """
+        Add a mapping to this instance.
+
+        Parameters
+        ----------
+        charcode : str
+            The character(s) to record.
+        glyph_index : GlyphIndexType
+            The corresponding glyph index to record.
+        subset : int
+            The subset in which the subset character code resides.
+        subset_charcode : CharacterCodeType
+            The subset character code within the above subset.
+        """
+        self._forward[(charcode, glyph_index)] = (subset, subset_charcode)
+        self._inverse[(subset, subset_charcode)] = (charcode, glyph_index)
+
+
 class CharacterTracker:
     """
     Helper for font subsetting by the PDF and PS backends.
@@ -114,16 +171,20 @@ class CharacterTracker:
     ----------
     subset_size : int
         The size at which characters are grouped into subsets.
-    used : dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]]
+    used : dict
         A dictionary of font files to character maps.
 
-        The key is a font filename and subset within that font.
+        The key is a font filename.
 
-        The value is a dictionary mapping a character code to a glyph index. Note this
-        mapping is the inverse of FreeType, which maps glyph indices to character codes.
+        The value is a list of dictionaries, each mapping at most *subset_size*
+        character codes to glyph indices. Note this mapping is the inverse of FreeType,
+        which maps glyph indices to character codes.
 
         If *subset_size* is not set, then there will only be one subset per font
         filename.
+    glyph_maps : dict
+        A dictionary of font files to glyph maps. You probably will want to use the
+        `.subset_to_unicode` method instead of this attribute.
     """
 
     def __init__(self, subset_size: int = 0):
@@ -134,7 +195,8 @@ def __init__(self, subset_size: int = 0):
             The maximum size that is supported for an embedded font. If provided, then
             characters will be grouped into these sized subsets.
         """
-        self.used: dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]] = {}
+        self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
+        self.glyph_maps: dict[str, GlyphMap] = {}
         self.subset_size = subset_size
 
     def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
@@ -157,33 +219,24 @@ def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
             whole). If *subset_size* is not specified, then the subset will always be 0
             and the character codes will be returned from the string unchanged.
         """
-        font_glyphs = []
-        char_to_font = font._get_fontmap(s)
-        for _c, _f in char_to_font.items():
-            charcode = ord(_c)
-            glyph_index = _f.get_char_index(charcode)
-            if self.subset_size != 0:
-                subset = charcode // self.subset_size
-                subset_charcode = charcode % self.subset_size
-            else:
-                subset = 0
-                subset_charcode = charcode
-            self.used.setdefault((_f.fname, subset), {})[subset_charcode] = glyph_index
-            font_glyphs.append((subset, subset_charcode))
-        return font_glyphs
-
-    def track_glyph(
-            self, font: FT2Font, charcode: CharacterCodeType,
-            glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
+        return [
+            self.track_glyph(f, ord(c), f.get_char_index(ord(c)))
+            for c, f in font._get_fontmap(s).items()
+        ]
+
+    def track_glyph(self, font: FT2Font, chars: str | CharacterCodeType,
+                    glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
         """
         Record character code *charcode* at glyph index *glyph* as using font *font*.
 
         Parameters
         ----------
         font : FT2Font
             A font that is being used for the provided string.
-        charcode : CharacterCodeType
-            The character code to record.
+        chars : str or CharacterCodeType
+            The character(s) to record. This may be a single character code, or multiple
+            characters in a string, if the glyph maps to several characters. It will be
+            normalized to a string internally.
         glyph : GlyphIndexType
             The corresponding glyph index to record.
 
@@ -196,33 +249,64 @@ def track_glyph(
             The character code within the above subset. If *subset_size* was not
             specified on this instance, then this is just *charcode* unmodified.
         """
-        if self.subset_size != 0:
-            subset = charcode // self.subset_size
-            subset_charcode = charcode % self.subset_size
+        if isinstance(chars, str):
+            charcode = ord(chars[0])
+        else:
+            charcode = chars
+            chars = chr(chars)
+
+        glyph_map = self.glyph_maps.setdefault(font.fname, GlyphMap())
+        if result := glyph_map.get(chars, glyph):
+            return result
+
+        subset_maps = self.used.setdefault(font.fname, [{}])
+        use_next_charmap = (
+            # Multi-character glyphs always go in the non-0 subset.
+            len(chars) > 1 or
+            # Default to preserving the character code as it was.
+            self.subset_size != 0
+            and (
+                # But start filling a new subset if outside the first block; this
+                # preserves ASCII (for Type 3) or the Basic Multilingual Plane (for
+                # Type 42).
+                charcode >= self.subset_size
+                # Or, use a new subset if the character code is already mapped for the
+                # first block. This means it's using an alternate glyph.
+                or charcode in subset_maps[0]
+            )
+        )
+        if use_next_charmap:
+            if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
+                subset_maps.append({})
+            subset = len(subset_maps) - 1
+            subset_charcode = len(subset_maps[-1])
         else:
             subset = 0
             subset_charcode = charcode
-        self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
+        subset_maps[subset][subset_charcode] = glyph
+        glyph_map.add(chars, glyph, subset, subset_charcode)
         return (subset, subset_charcode)
 
-    def subset_to_unicode(self, index: int,
-                          charcode: CharacterCodeType) -> CharacterCodeType:
+    def subset_to_unicode(self, fontname: str, subset: int,
+                          subset_charcode: CharacterCodeType) -> str:
         """
         Map a subset index and character code to a Unicode character code.
 
         Parameters
         ----------
-        index : int
+        fontname : str
+            The name of the font, from the *used* dictionary key.
+        subset : int
             The subset index within a font.
-        charcode : CharacterCodeType
+        subset_charcode : CharacterCodeType
             The character code within a subset to map back.
 
         Returns
         -------
-        CharacterCodeType
-            The Unicode character code corresponding to the subsetted one.
+        str
+            The Unicode character(s) corresponding to the subsetted character code.
         """
-        return index * self.subset_size + charcode
+        return self.glyph_maps[fontname].iget(subset, subset_charcode)[0]
 
 
 class RendererPDFPSBase(RendererBase):
diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
@@ -368,12 +368,6 @@ def pdfRepr(obj):
                         "objects")
 
 
-_FONT_MAX_GLYPH = {
-    3: 256,
-    42: 65536,
-}
-
-
 class Reference:
     """
     PDF reference object.
@@ -691,7 +685,7 @@ def __init__(self, filename, metadata=None):
         self._fontNames = {}     # maps filenames to internal font names
         self._dviFontInfo = {}   # maps pdf names to dvifonts
         self._character_tracker = _backend_pdf_ps.CharacterTracker(
-            _FONT_MAX_GLYPH.get(mpl.rcParams['pdf.fonttype'], 0))
+            _backend_pdf_ps._FONT_MAX_GLYPH.get(mpl.rcParams['pdf.fonttype'], 0))
 
         self.alphaStates = {}   # maps alpha values to graphics state objects
         self._alpha_state_seq = (Name(f'A{i}') for i in itertools.count(1))
@@ -948,9 +942,8 @@ def writeFonts(self):
             else:
                 # a normal TrueType font
                 _log.debug('Writing TrueType font.')
-                charmap = self._character_tracker.used.get((filename, subset))
-                if charmap:
-                    fonts[Fx] = self.embedTTF(filename, subset, charmap)
+                charmap = self._character_tracker.used[filename][subset]
+                fonts[Fx] = self.embedTTF(filename, subset, charmap)
         self.writeObject(self.fontObject, fonts)
 
     def _write_afm_font(self, filename):
@@ -992,8 +985,12 @@ def _embedTeXFont(self, dvifont):
 
         # Reduce the font to only the glyphs used in the document, get the encoding
         # for that subset, and compute various properties based on the encoding.
-        charmap = self._character_tracker.used[(dvifont.fname, 0)]
-        chars = frozenset(charmap.keys())
+        charmap = self._character_tracker.used[dvifont.fname][0]
+        chars = {
+            # DVI type 1 fonts always map single glyph to single character.
+            ord(self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode))
+            for ccode in charmap
+        }
         t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
         fontdict['BaseFont'] = Name(t1font.prop['FontName'])
         # createType1Descriptor writes the font data as a side effect
@@ -1144,14 +1141,16 @@ def generate_unicode_cmap(subset_index, charmap):
                     unicode_groups[-1][1] = ccode
                 last_ccode = ccode
 
+            def _to_unicode(ccode):
+                chars = self._character_tracker.subset_to_unicode(
+                    filename, subset_index, ccode)
+                hexstr = chars.encode('utf-16be').hex()
+                return f'<{hexstr}>'
+
             width = 2 if fonttype == 3 else 4
             unicode_bfrange = []
             for start, end in unicode_groups:
-                real_start = self._character_tracker.subset_to_unicode(subset_index,
-                                                                       start)
-                real_end = self._character_tracker.subset_to_unicode(subset_index, end)
-                real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
-                                       for x in range(real_start, real_end+1))
+                real_values = ' '.join(_to_unicode(x) for x in range(start, end+1))
                 unicode_bfrange.append(
                     f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
             unicode_cmap = (self._identityToUnicodeCMap %
@@ -2325,7 +2324,7 @@ def output_singlebyte_chunk(kerns_or_chars):
             for item in _text_helpers.layout(s, font, kern_mode=Kerning.UNFITTED,
                                              language=language):
                 subset, charcode = self.file._character_tracker.track_glyph(
-                    item.ft_object, ord(item.char), item.glyph_index)
+                    item.ft_object, item.char, item.glyph_index)
                 if (item.ft_object, subset) != prev_font:
                     if singlebyte_chunk:
                         output_singlebyte_chunk(singlebyte_chunk)
diff --git a/lib/matplotlib/backends/backend_ps.py b/lib/matplotlib/backends/backend_ps.py