pdf: Support multi-character glyphs when subsetting

QuLogic · QuLogic · commit 662ac58c4980 · 2025-09-29T20:25:53.000-04:00
For ligatures or complex shapings, multiple characters may map to a
single glyph. In this case, we still want to output a single character
code for the string using the font subset, but the `ToUnicode` map
should give back all the characters.
diff --git a/lib/matplotlib/backends/_backend_pdf_ps.py b/lib/matplotlib/backends/_backend_pdf_ps.py
@@ -107,11 +107,11 @@ class GlyphMap:
     """
     A two-way glyph mapping.
 
-    The forward glyph map is from (character code, glyph index)-pairs to (subset index,
-    subset character code)-pairs.
+    The forward glyph map is from (character string, glyph index)-pairs to
+    (subset index, subset character code)-pairs.
 
     The inverse glyph map is from to (subset index, subset character code)-pairs to
-    (character code, glyph index)-pairs.
+    (character string, glyph index)-pairs.
     """
 
     def __init__(self) -> None:
@@ -120,22 +120,21 @@ def __init__(self) -> None:
         self._inverse: dict[tuple[int, CharacterCodeType],
                             tuple[CharacterCodeType, GlyphIndexType]] = {}
 
-    def get(self, charcode: CharacterCodeType,
+    def get(self, charcodes: str,
             glyph_index: GlyphIndexType) -> tuple[int, CharacterCodeType] | None:
         """
-        Get the forward mapping from a (character code, glyph index)-pair.
+        Get the forward mapping from a (character string, glyph index)-pair.
 
         This may return *None* if the pair is not currently mapped.
         """
-        return self._forward.get((charcode, glyph_index))
+        return self._forward.get((charcodes, glyph_index))
 
     def iget(self, subset: int,
-             subset_charcode: CharacterCodeType) -> tuple[CharacterCodeType,
-                                                          GlyphIndexType]:
+             subset_charcode: CharacterCodeType) -> tuple[str, GlyphIndexType]:
         """Get the inverse mapping from a (subset, subset charcode)-pair."""
         return self._inverse[(subset, subset_charcode)]
 
-    def add(self, charcode: CharacterCodeType, glyph_index: GlyphIndexType, subset: int,
+    def add(self, charcode: str, glyph_index: GlyphIndexType, subset: int,
             subset_charcode: CharacterCodeType) -> None:
         """
         Add a mapping to this instance.
@@ -219,18 +218,19 @@ def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
             for c, f in font._get_fontmap(s).items()
         ]
 
-    def track_glyph(
-            self, font: FT2Font, charcode: CharacterCodeType,
-            glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
+    def track_glyph(self, font: FT2Font, chars: str | CharacterCodeType,
+                    glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
         """
         Record character code *charcode* at glyph index *glyph* as using font *font*.
 
         Parameters
         ----------
         font : FT2Font
             A font that is being used for the provided string.
-        charcode : CharacterCodeType
-            The character code to record.
+        chars : str or CharacterCodeType
+            The character(s) to record. This may be a single character code, or multiple
+            characters in a string, if the glyph maps to several characters. It will be
+            normalized to a string internally.
         glyph : GlyphIndexType
             The corresponding glyph index to record.
 
@@ -243,13 +243,21 @@ def track_glyph(
             The character code within the above subset. If *subset_size* was not
             specified on this instance, then this is just *charcode* unmodified.
         """
+        if isinstance(chars, str):
+            charcode = ord(chars[0])
+        else:
+            charcode = chars
+            chars = chr(chars)
+
         glyph_map = self.glyph_maps.setdefault(font.fname, GlyphMap())
-        if result := glyph_map.get(charcode, glyph):
+        if result := glyph_map.get(chars, glyph):
             return result
 
         subset_maps = self.used.setdefault(font.fname, [{}])
-        # Default to preserving the character code as it was.
         use_next_charmap = (
+            # Multi-character glyphs always go in a non-zero subset.
+            len(chars) > 1 or
+            # Default to preserving the character code as it was.
             self.subset_size != 0
             and (
                 # But start filling a new subset if outside the first block; this
@@ -270,11 +278,11 @@ def track_glyph(
             subset = 0
             subset_charcode = charcode
         subset_maps[subset][subset_charcode] = glyph
-        glyph_map.add(charcode, glyph, subset, subset_charcode)
+        glyph_map.add(chars, glyph, subset, subset_charcode)
         return (subset, subset_charcode)
 
     def subset_to_unicode(self, fontname: str, subset: int,
-                          subset_charcode: CharacterCodeType) -> CharacterCodeType:
+                          subset_charcode: CharacterCodeType) -> str:
         """
         Map a subset index and character code to a Unicode character code.
 
@@ -289,8 +297,8 @@ def subset_to_unicode(self, fontname: str, subset: int,
 
         Returns
         -------
-        CharacterCodeType
-            The Unicode character code corresponding to the subsetted one.
+        str
+            The Unicode character(s) corresponding to the subsetted character code.
         """
         return self.glyph_maps[fontname].iget(subset, subset_charcode)[0]
 
diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
@@ -996,7 +996,8 @@ def _embedTeXFont(self, dvifont):
         # for that subset, and compute various properties based on the encoding.
         charmap = self._character_tracker.used[dvifont.fname][0]
         chars = {
-            self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
+            # DVI fonts always map a single glyph to a single character.
+            ord(self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode))
             for ccode in charmap
         }
         t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
@@ -1150,10 +1151,10 @@ def generate_unicode_cmap(subset_index, charmap):
                 last_ccode = ccode
 
             def _to_unicode(ccode):
-                real_ccode = self._character_tracker.subset_to_unicode(
+                chars = self._character_tracker.subset_to_unicode(
                     filename, subset_index, ccode)
-                unicodestr = chr(real_ccode).encode('utf-16be').hex()
-                return f'<{unicodestr}>'
+                hexstr = chars.encode('utf-16be').hex()
+                return f'<{hexstr}>'
 
             width = 2 if fonttype == 3 else 4
             unicode_bfrange = []
@@ -2332,7 +2333,7 @@ def output_singlebyte_chunk(kerns_or_chars):
             for item in _text_helpers.layout(s, font, kern_mode=Kerning.UNFITTED,
                                              language=language):
                 subset, charcode = self.file._character_tracker.track_glyph(
-                    item.ft_object, ord(item.char), item.glyph_index)
+                    item.ft_object, item.char, item.glyph_index)
                 if (item.ft_object, subset) != prev_font:
                     if singlebyte_chunk:
                         output_singlebyte_chunk(singlebyte_chunk)