pdf/ps: Compress subsetted font blocks

QuLogic · QuLogic · commit 8274e1733a0d · 2025-09-30T00:57:28.000-04:00
Instead of splitting fonts into `subset_size` blocks and writing text as
character code modulo `subset_size`, compress the blocks by doing two
things:

1. Preserve the character code if it lies in the first block. This keeps
   ASCII (for Type 3) and the Basic Multilingual Plane (for Type 42) as
   their normal codes.
2. Push everything else into the next spot in the next block, splitting
   by `subset_size` as necessary.

This should reduce the number of additional font subsets to embed.
diff --git a/lib/matplotlib/backends/_backend_pdf_ps.py b/lib/matplotlib/backends/_backend_pdf_ps.py
@@ -103,6 +103,58 @@ def font_as_file(font):
     return fh
 
 
+class GlyphMap:
+    """
+    A two-way glyph mapping.
+
+    The forward glyph map is from (character code, glyph index)-pairs to (subset index,
+    subset character code)-pairs.
+
+    The inverse glyph map is from (subset index, subset character code)-pairs to
+    (character code, glyph index)-pairs.
+    """
+
+    def __init__(self) -> None:
+        self._forward: dict[tuple[CharacterCodeType, GlyphIndexType],
+                            tuple[int, CharacterCodeType]] = {}
+        self._inverse: dict[tuple[int, CharacterCodeType],
+                            tuple[CharacterCodeType, GlyphIndexType]] = {}
+
+    def get(self, charcode: CharacterCodeType,
+            glyph_index: GlyphIndexType) -> tuple[int, CharacterCodeType] | None:
+        """
+        Get the forward mapping from a (character code, glyph index)-pair.
+
+        This may return *None* if the pair is not currently mapped.
+        """
+        return self._forward.get((charcode, glyph_index))
+
+    def iget(self, subset: int,
+             subset_charcode: CharacterCodeType) -> tuple[CharacterCodeType,
+                                                          GlyphIndexType]:
+        """Get the inverse mapping from a (subset, subset charcode)-pair."""
+        return self._inverse[(subset, subset_charcode)]
+
+    def add(self, charcode: CharacterCodeType, glyph_index: GlyphIndexType, subset: int,
+            subset_charcode: CharacterCodeType) -> None:
+        """
+        Add a mapping to this instance.
+
+        Parameters
+        ----------
+        charcode : CharacterCodeType
+            The character code to record.
+        glyph_index : GlyphIndexType
+            The corresponding glyph index to record.
+        subset : int
+            The subset in which the subset character code resides.
+        subset_charcode : CharacterCodeType
+            The subset character code within the above subset.
+        """
+        self._forward[(charcode, glyph_index)] = (subset, subset_charcode)
+        self._inverse[(subset, subset_charcode)] = (charcode, glyph_index)
+
+
 class CharacterTracker:
     """
     Helper for font subsetting by the PDF and PS backends.
@@ -114,16 +166,20 @@ class CharacterTracker:
     ----------
     subset_size : int
         The size at which characters are grouped into subsets.
-    used : dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]]
+    used : dict
         A dictionary of font files to character maps.
 
-        The key is a font filename and subset within that font.
+        The key is a font filename.
 
-        The value is a dictionary mapping a character code to a glyph index. Note this
-        mapping is the inverse of FreeType, which maps glyph indices to character codes.
+        The value is a list of dictionaries, each mapping at most *subset_size*
+        character codes to glyph indices. Note this mapping is the inverse of FreeType,
+        which maps glyph indices to character codes.
 
         If *subset_size* is not set, then there will only be one subset per font
         filename.
+    glyph_maps : dict
+        A dictionary of font files to glyph maps. You probably will want to use the
+        `.subset_to_unicode` method instead of this attribute.
     """
 
     def __init__(self, subset_size: int = 0):
@@ -134,7 +190,8 @@ def __init__(self, subset_size: int = 0):
             The maximum size that is supported for an embedded font. If provided, then
             characters will be grouped into these sized subsets.
         """
-        self.used: dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]] = {}
+        self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
+        self.glyph_maps: dict[str, GlyphMap] = {}
         self.subset_size = subset_size
 
     def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
@@ -186,33 +243,50 @@ def track_glyph(
             The character code within the above subset. If *subset_size* was not
             specified on this instance, then this is just *charcode* unmodified.
         """
-        if self.subset_size != 0:
-            subset = charcode // self.subset_size
-            subset_charcode = charcode % self.subset_size
+        glyph_map = self.glyph_maps.setdefault(font.fname, GlyphMap())
+        if result := glyph_map.get(charcode, glyph):
+            return result
+
+        subset_maps = self.used.setdefault(font.fname, [{}])
+        # Default to preserving the character code as it was.
+        use_next_charmap = (
+            self.subset_size != 0
+            # But start filling a new subset if outside the first block; this preserves
+            # ASCII (for Type 3) or the Basic Multilingual Plane (for Type 42).
+            and charcode >= self.subset_size
+        )
+        if use_next_charmap:
+            if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
+                subset_maps.append({})
+            subset = len(subset_maps) - 1
+            subset_charcode = len(subset_maps[-1])
         else:
             subset = 0
             subset_charcode = charcode
-        self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
+        subset_maps[subset][subset_charcode] = glyph
+        glyph_map.add(charcode, glyph, subset, subset_charcode)
         return (subset, subset_charcode)
 
-    def subset_to_unicode(self, index: int,
-                          charcode: CharacterCodeType) -> CharacterCodeType:
+    def subset_to_unicode(self, fontname: str, subset: int,
+                          subset_charcode: CharacterCodeType) -> CharacterCodeType:
         """
         Map a subset index and character code to a Unicode character code.
 
         Parameters
         ----------
-        index : int
+        fontname : str
+            The name of the font, from the *used* dictionary key.
+        subset : int
             The subset index within a font.
-        charcode : CharacterCodeType
+        subset_charcode : CharacterCodeType
             The character code within a subset to map back.
 
         Returns
         -------
         CharacterCodeType
             The Unicode character code corresponding to the subsetted one.
         """
-        return index * self.subset_size + charcode
+        return self.glyph_maps[fontname].iget(subset, subset_charcode)[0]
 
 
 class RendererPDFPSBase(RendererBase):
diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
@@ -948,9 +948,8 @@ def writeFonts(self):
             else:
                 # a normal TrueType font
                 _log.debug('Writing TrueType font.')
-                charmap = self._character_tracker.used.get((filename, subset))
-                if charmap:
-                    fonts[Fx] = self.embedTTF(filename, subset, charmap)
+                charmap = self._character_tracker.used[filename][subset]
+                fonts[Fx] = self.embedTTF(filename, subset, charmap)
         self.writeObject(self.fontObject, fonts)
 
     def _write_afm_font(self, filename):
@@ -992,8 +991,11 @@ def _embedTeXFont(self, dvifont):
 
         # Reduce the font to only the glyphs used in the document, get the encoding
         # for that subset, and compute various properties based on the encoding.
-        charmap = self._character_tracker.used[(dvifont.fname, 0)]
-        chars = frozenset(charmap.keys())
+        charmap = self._character_tracker.used[dvifont.fname][0]
+        chars = {
+            self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
+            for ccode in charmap
+        }
         t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
         fontdict['BaseFont'] = Name(t1font.prop['FontName'])
         # createType1Descriptor writes the font data as a side effect
@@ -1144,14 +1146,16 @@ def generate_unicode_cmap(subset_index, charmap):
                     unicode_groups[-1][1] = ccode
                 last_ccode = ccode
 
+            def _to_unicode(ccode):
+                real_ccode = self._character_tracker.subset_to_unicode(
+                    filename, subset_index, ccode)
+                unicodestr = chr(real_ccode).encode('utf-16be').hex()
+                return f'<{unicodestr}>'
+
             width = 2 if fonttype == 3 else 4
             unicode_bfrange = []
             for start, end in unicode_groups:
-                real_start = self._character_tracker.subset_to_unicode(subset_index,
-                                                                       start)
-                real_end = self._character_tracker.subset_to_unicode(subset_index, end)
-                real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
-                                       for x in range(real_start, real_end+1))
+                real_values = ' '.join(_to_unicode(x) for x in range(start, end+1))
                 unicode_bfrange.append(
                     f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
             unicode_cmap = (self._identityToUnicodeCMap %
diff --git a/lib/matplotlib/backends/backend_ps.py b/lib/matplotlib/backends/backend_ps.py
@@ -1065,24 +1065,24 @@ def print_figure_impl(fh):
             Ndict = len(_psDefs)
             print("%%BeginProlog", file=fh)
             if not mpl.rcParams['ps.useafm']:
-                Ndict += len(ps_renderer._character_tracker.used)
+                Ndict += sum(map(len, ps_renderer._character_tracker.used.values()))
             print("/mpldict %d dict def" % Ndict, file=fh)
             print("mpldict begin", file=fh)
             print("\n".join(_psDefs), file=fh)
             if not mpl.rcParams['ps.useafm']:
-                for (font, subset_index), charmap in \
-                        ps_renderer._character_tracker.used.items():
-                    if not charmap:
-                        continue
-                    fonttype = mpl.rcParams['ps.fonttype']
-                    # Can't use more than 255 chars from a single Type 3 font.
-                    if len(charmap) > 255:
-                        fonttype = 42
-                    fh.flush()
-                    if fonttype == 3:
-                        fh.write(_font_to_ps_type3(font, charmap.values()))
-                    else:  # Type 42 only.
-                        _font_to_ps_type42(font, charmap.values(), fh)
+                for font, subsets in ps_renderer._character_tracker.used.items():
+                    for charmap in subsets:
+                        if not charmap:
+                            continue
+                        fonttype = mpl.rcParams['ps.fonttype']
+                        # Can't use more than 255 chars from a single Type 3 font.
+                        if len(charmap) > 255:
+                            fonttype = 42
+                        fh.flush()
+                        if fonttype == 3:
+                            fh.write(_font_to_ps_type3(font, charmap.values()))
+                        else:  # Type 42 only.
+                            _font_to_ps_type42(font, charmap.values(), fh)
             print("end", file=fh)
             print("%%EndProlog", file=fh)