Skip to content

Commit f0666b3

Browse files
committed
pdf/ps: Compress subsetted font blocks
Instead of splitting fonts into `subset_size` blocks and writing text as the character code modulo `subset_size`, compress the blocks by doing two things: 1. Preserve the character code if it lies in the first block (i.e., it is below `subset_size`). This keeps ASCII (for Type 3) and the Basic Multilingual Plane (for Type 42) at their normal codes. 2. Assign every other character the next free slot in an additional block, starting a new block whenever the current one reaches `subset_size` entries. This should reduce the number of additional font subsets that need to be embedded.
1 parent 5afd71b commit f0666b3

File tree

3 files changed

+73
-37
lines changed

3 files changed

+73
-37
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -114,16 +114,21 @@ class CharacterTracker:
114114
----------
115115
subset_size : int
116116
The size at which characters are grouped into subsets.
117-
used : dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]]
117+
used : dict
118118
A dictionary of font files to character maps.
119119
120-
The key is a font filename and subset within that font.
120+
The key is a font filename.
121121
122-
The value is a dictionary mapping a character code to a glyph index. Note this
123-
mapping is the inverse of FreeType, which maps glyph indices to character codes.
122+
The value is a list of dictionaries, each mapping at most *subset_size*
123+
character codes to glyph indices. Note this mapping is the inverse of FreeType,
124+
which maps glyph indices to character codes.
124125
125126
If *subset_size* is not set, then there will only be one subset per font
126127
filename.
128+
glyph_map : dict
129+
A dictionary of font files to glyph maps. The glyph map is from (character code,
130+
glyph index)-pairs to (subset index, subset character code)-pairs. You probably
131+
will want to use the `.subset_to_unicode` method instead of this attribute.
127132
"""
128133

129134
def __init__(self, subset_size: int = 0):
@@ -134,7 +139,10 @@ def __init__(self, subset_size: int = 0):
134139
The maximum size that is supported for an embedded font. If provided, then
135140
characters will be grouped into these sized subsets.
136141
"""
137-
self.used: dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]] = {}
142+
self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
143+
self.glyph_map: dict[str,
144+
dict[tuple[CharacterCodeType, GlyphIndexType],
145+
tuple[int, CharacterCodeType]]] = {}
138146
self.subset_size = subset_size
139147

140148
def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
@@ -186,22 +194,39 @@ def track_glyph(
186194
The character code within the above subset. If *subset_size* was not
187195
specified on this instance, then this is just *charcode* unmodified.
188196
"""
197+
glyph_map = self.glyph_map.setdefault(font.fname, {})
198+
key = (charcode, glyph)
199+
if key in glyph_map:
200+
return glyph_map[key]
201+
202+
subset_maps = self.used.setdefault(font.fname, [{}])
203+
# Default to preserving the character code as it was.
204+
subset = 0
205+
subset_charcode = charcode
206+
use_next_charmap = False
189207
if self.subset_size != 0:
190-
subset = charcode // self.subset_size
191-
subset_charcode = charcode % self.subset_size
192-
else:
193-
subset = 0
194-
subset_charcode = charcode
195-
self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
208+
# But start filling a new subset if outside the first block; this preserves
209+
# ASCII (for Type 3) or the Basic Multilingual Plane (for Type 42).
210+
if charcode >= self.subset_size:
211+
use_next_charmap = True
212+
if use_next_charmap:
213+
if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
214+
subset_maps.append({})
215+
subset = len(subset_maps) - 1
216+
subset_charcode = len(subset_maps[-1])
217+
subset_maps[subset][subset_charcode] = glyph
218+
glyph_map[key] = (subset, subset_charcode)
196219
return (subset, subset_charcode)
197220

198-
def subset_to_unicode(self, index: int,
221+
def subset_to_unicode(self, fontname: str, index: int,
199222
charcode: CharacterCodeType) -> CharacterCodeType:
200223
"""
201224
Map a subset index and character code to a Unicode character code.
202225
203226
Parameters
204227
----------
228+
fontname : str
229+
The name of the font, from the *used* dictionary key.
205230
index : int
206231
The subset index within a font.
207232
charcode : CharacterCodeType
@@ -212,7 +237,11 @@ def subset_to_unicode(self, index: int,
212237
CharacterCodeType
213238
The Unicode character code corresponding to the subsetted one.
214239
"""
215-
return index * self.subset_size + charcode
240+
search = (index, charcode)
241+
for orig_info, subset_info in self.glyph_map[fontname].items():
242+
if search == subset_info:
243+
return orig_info[0]
244+
raise ValueError(f'{charcode} does not exist in {fontname} subset {index}')
216245

217246

218247
class RendererPDFPSBase(RendererBase):

lib/matplotlib/backends/backend_pdf.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -948,9 +948,11 @@ def writeFonts(self):
948948
else:
949949
# a normal TrueType font
950950
_log.debug('Writing TrueType font.')
951-
charmap = self._character_tracker.used.get((filename, subset))
952-
if charmap:
953-
fonts[Fx] = self.embedTTF(filename, subset, charmap)
951+
charmaps = self._character_tracker.used.get(filename, [])
952+
if charmaps:
953+
charmap = charmaps[subset]
954+
if charmap:
955+
fonts[Fx] = self.embedTTF(filename, subset, charmap)
954956
self.writeObject(self.fontObject, fonts)
955957

956958
def _write_afm_font(self, filename):
@@ -992,8 +994,11 @@ def _embedTeXFont(self, dvifont):
992994

993995
# Reduce the font to only the glyphs used in the document, get the encoding
994996
# for that subset, and compute various properties based on the encoding.
995-
charmap = self._character_tracker.used[(dvifont.fname, 0)]
996-
chars = frozenset(charmap.keys())
997+
charmap = self._character_tracker.used[dvifont.fname][0]
998+
chars = {
999+
self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
1000+
for ccode in charmap
1001+
}
9971002
t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
9981003
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
9991004
# createType1Descriptor writes the font data as a side effect
@@ -1144,14 +1149,16 @@ def generate_unicode_cmap(subset_index, charmap):
11441149
unicode_groups[-1][1] = ccode
11451150
last_ccode = ccode
11461151

1152+
def _to_unicode(ccode):
1153+
real_ccode = self._character_tracker.subset_to_unicode(
1154+
filename, subset_index, ccode)
1155+
unicodestr = chr(real_ccode).encode('utf-16be').hex()
1156+
return f'<{unicodestr}>'
1157+
11471158
width = 2 if fonttype == 3 else 4
11481159
unicode_bfrange = []
11491160
for start, end in unicode_groups:
1150-
real_start = self._character_tracker.subset_to_unicode(subset_index,
1151-
start)
1152-
real_end = self._character_tracker.subset_to_unicode(subset_index, end)
1153-
real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
1154-
for x in range(real_start, real_end+1))
1161+
real_values = ' '.join(_to_unicode(x) for x in range(start, end+1))
11551162
unicode_bfrange.append(
11561163
f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
11571164
unicode_cmap = (self._identityToUnicodeCMap %

lib/matplotlib/backends/backend_ps.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,24 +1065,24 @@ def print_figure_impl(fh):
10651065
Ndict = len(_psDefs)
10661066
print("%%BeginProlog", file=fh)
10671067
if not mpl.rcParams['ps.useafm']:
1068-
Ndict += len(ps_renderer._character_tracker.used)
1068+
Ndict += sum(map(len, ps_renderer._character_tracker.used.values()), 0)
10691069
print("/mpldict %d dict def" % Ndict, file=fh)
10701070
print("mpldict begin", file=fh)
10711071
print("\n".join(_psDefs), file=fh)
10721072
if not mpl.rcParams['ps.useafm']:
1073-
for (font, subset_index), charmap in \
1074-
ps_renderer._character_tracker.used.items():
1075-
if not charmap:
1076-
continue
1077-
fonttype = mpl.rcParams['ps.fonttype']
1078-
# Can't use more than 255 chars from a single Type 3 font.
1079-
if len(charmap) > 255:
1080-
fonttype = 42
1081-
fh.flush()
1082-
if fonttype == 3:
1083-
fh.write(_font_to_ps_type3(font, charmap.values()))
1084-
else: # Type 42 only.
1085-
_font_to_ps_type42(font, charmap.values(), fh)
1073+
for font, subsets in ps_renderer._character_tracker.used.items():
1074+
for charmap in subsets:
1075+
if not charmap:
1076+
continue
1077+
fonttype = mpl.rcParams['ps.fonttype']
1078+
# Can't use more than 255 chars from a single Type 3 font.
1079+
if len(charmap) > 255:
1080+
fonttype = 42
1081+
fh.flush()
1082+
if fonttype == 3:
1083+
fh.write(_font_to_ps_type3(font, charmap.values()))
1084+
else: # Type 42 only.
1085+
_font_to_ps_type42(font, charmap.values(), fh)
10861086
print("end", file=fh)
10871087
print("%%EndProlog", file=fh)
10881088

0 commit comments

Comments
 (0)