pdf/ps: Compress subsetted font blocks

QuLogic · QuLogic · commit f0666b393dec · 2025-09-27T04:22:40.000-04:00
Instead of splitting fonts into `subset_size` blocks and writing text as
character code modulo `subset_size`, compress the blocks by doing two
things:

1. Preserve the character code if it lies in the first block. This keeps
   ASCII (for Type 3) and the Basic Multilingual Plane (for Type 42) as
   their normal codes.
2. Push everything else into the next free slot of the most recent
   additional block, starting a new block whenever the current one
   already holds `subset_size` entries.

This should reduce the number of additional font subsets to embed.
diff --git a/lib/matplotlib/backends/_backend_pdf_ps.py b/lib/matplotlib/backends/_backend_pdf_ps.py
@@ -114,16 +114,21 @@ class CharacterTracker:
     ----------
     subset_size : int
         The size at which characters are grouped into subsets.
-    used : dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]]
+    used : dict
         A dictionary of font files to character maps.
 
-        The key is a font filename and subset within that font.
+        The key is a font filename.
 
-        The value is a dictionary mapping a character code to a glyph index. Note this
-        mapping is the inverse of FreeType, which maps glyph indices to character codes.
+        The value is a list of dictionaries, each mapping at most *subset_size*
+        character codes to glyph indices. Note this mapping is the inverse of FreeType,
+        which maps glyph indices to character codes.
 
         If *subset_size* is not set, then there will only be one subset per font
         filename.
+    glyph_map : dict
+        A dictionary of font files to glyph maps. The glyph map is from (character code,
+        glyph index)-pairs to (subset index, subset character code)-pairs. You probably
+        will want to use the `.subset_to_unicode` method instead of this attribute.
     """
 
     def __init__(self, subset_size: int = 0):
@@ -134,7 +139,10 @@ def __init__(self, subset_size: int = 0):
             The maximum size that is supported for an embedded font. If provided, then
             characters will be grouped into these sized subsets.
         """
-        self.used: dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]] = {}
+        self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
+        self.glyph_map: dict[str,
+                             dict[tuple[CharacterCodeType, GlyphIndexType],
+                                  tuple[int, CharacterCodeType]]] = {}
         self.subset_size = subset_size
 
     def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
@@ -186,22 +194,39 @@ def track_glyph(
             The character code within the above subset. If *subset_size* was not
             specified on this instance, then this is just *charcode* unmodified.
         """
+        glyph_map = self.glyph_map.setdefault(font.fname, {})
+        key = (charcode, glyph)
+        if key in glyph_map:
+            return glyph_map[key]
+
+        subset_maps = self.used.setdefault(font.fname, [{}])
+        # Default to preserving the character code as it was.
+        subset = 0
+        subset_charcode = charcode
+        use_next_charmap = False
         if self.subset_size != 0:
-            subset = charcode // self.subset_size
-            subset_charcode = charcode % self.subset_size
-        else:
-            subset = 0
-            subset_charcode = charcode
-        self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
+            # But start filling a new subset if outside the first block; this preserves
+            # ASCII (for Type 3) or the Basic Multilingual Plane (for Type 42).
+            if charcode >= self.subset_size:
+                use_next_charmap = True
+        if use_next_charmap:
+            if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
+                subset_maps.append({})
+            subset = len(subset_maps) - 1
+            subset_charcode = len(subset_maps[-1])
+        subset_maps[subset][subset_charcode] = glyph
+        glyph_map[key] = (subset, subset_charcode)
         return (subset, subset_charcode)
 
-    def subset_to_unicode(self, index: int,
+    def subset_to_unicode(self, fontname: str, index: int,
                           charcode: CharacterCodeType) -> CharacterCodeType:
         """
         Map a subset index and character code to a Unicode character code.
 
         Parameters
         ----------
+        fontname : str
+            The name of the font, from the *used* dictionary key.
         index : int
             The subset index within a font.
         charcode : CharacterCodeType
@@ -212,7 +237,11 @@ def subset_to_unicode(self, index: int,
         CharacterCodeType
             The Unicode character code corresponding to the subsetted one.
         """
-        return index * self.subset_size + charcode
+        search = (index, charcode)
+        for orig_info, subset_info in self.glyph_map[fontname].items():
+            if search == subset_info:
+                return orig_info[0]
+        raise ValueError(f'{charcode} does not exist in {fontname} subset {index}')
 
 
 class RendererPDFPSBase(RendererBase):
diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
@@ -948,9 +948,11 @@ def writeFonts(self):
             else:
                 # a normal TrueType font
                 _log.debug('Writing TrueType font.')
-                charmap = self._character_tracker.used.get((filename, subset))
-                if charmap:
-                    fonts[Fx] = self.embedTTF(filename, subset, charmap)
+                charmaps = self._character_tracker.used.get(filename, [])
+                if charmaps:
+                    charmap = charmaps[subset]
+                    if charmap:
+                        fonts[Fx] = self.embedTTF(filename, subset, charmap)
         self.writeObject(self.fontObject, fonts)
 
     def _write_afm_font(self, filename):
@@ -992,8 +994,11 @@ def _embedTeXFont(self, dvifont):
 
         # Reduce the font to only the glyphs used in the document, get the encoding
         # for that subset, and compute various properties based on the encoding.
-        charmap = self._character_tracker.used[(dvifont.fname, 0)]
-        chars = frozenset(charmap.keys())
+        charmap = self._character_tracker.used[dvifont.fname][0]
+        chars = {
+            self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
+            for ccode in charmap
+        }
         t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
         fontdict['BaseFont'] = Name(t1font.prop['FontName'])
         # createType1Descriptor writes the font data as a side effect
@@ -1144,14 +1149,16 @@ def generate_unicode_cmap(subset_index, charmap):
                     unicode_groups[-1][1] = ccode
                 last_ccode = ccode
 
+            def _to_unicode(ccode):
+                real_ccode = self._character_tracker.subset_to_unicode(
+                    filename, subset_index, ccode)
+                unicodestr = chr(real_ccode).encode('utf-16be').hex()
+                return f'<{unicodestr}>'
+
             width = 2 if fonttype == 3 else 4
             unicode_bfrange = []
             for start, end in unicode_groups:
-                real_start = self._character_tracker.subset_to_unicode(subset_index,
-                                                                       start)
-                real_end = self._character_tracker.subset_to_unicode(subset_index, end)
-                real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
-                                       for x in range(real_start, real_end+1))
+                real_values = ' '.join(_to_unicode(x) for x in range(start, end+1))
                 unicode_bfrange.append(
                     f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
             unicode_cmap = (self._identityToUnicodeCMap %
diff --git a/lib/matplotlib/backends/backend_ps.py b/lib/matplotlib/backends/backend_ps.py
@@ -1065,24 +1065,24 @@ def print_figure_impl(fh):
             Ndict = len(_psDefs)
             print("%%BeginProlog", file=fh)
             if not mpl.rcParams['ps.useafm']:
-                Ndict += len(ps_renderer._character_tracker.used)
+                Ndict += sum(map(len, ps_renderer._character_tracker.used.values()), 0)
             print("/mpldict %d dict def" % Ndict, file=fh)
             print("mpldict begin", file=fh)
             print("\n".join(_psDefs), file=fh)
             if not mpl.rcParams['ps.useafm']:
-                for (font, subset_index), charmap in \
-                        ps_renderer._character_tracker.used.items():
-                    if not charmap:
-                        continue
-                    fonttype = mpl.rcParams['ps.fonttype']
-                    # Can't use more than 255 chars from a single Type 3 font.
-                    if len(charmap) > 255:
-                        fonttype = 42
-                    fh.flush()
-                    if fonttype == 3:
-                        fh.write(_font_to_ps_type3(font, charmap.values()))
-                    else:  # Type 42 only.
-                        _font_to_ps_type42(font, charmap.values(), fh)
+                for font, subsets in ps_renderer._character_tracker.used.items():
+                    for charmap in subsets:
+                        if not charmap:
+                            continue
+                        fonttype = mpl.rcParams['ps.fonttype']
+                        # Can't use more than 255 chars from a single Type 3 font.
+                        if len(charmap) > 255:
+                            fonttype = 42
+                        fh.flush()
+                        if fonttype == 3:
+                            fh.write(_font_to_ps_type3(font, charmap.values()))
+                        else:  # Type 42 only.
+                            _font_to_ps_type42(font, charmap.values(), fh)
             print("end", file=fh)
             print("%%EndProlog", file=fh)