Skip to content

Commit 8274e17

Browse files
committed
pdf/ps: Compress subsetted font blocks
Instead of splitting fonts into `subset_size` blocks and writing text as character code modulo `subset_size`, compress the blocks by doing two things: 1. Preserve the character code if it lies in the first block. This keeps ASCII (for Type 3) and the Basic Multilingual Plane (for Type 42) as their normal codes. 2. Push everything else into the next spot in the next block, splitting by `subset_size` as necessary. This should reduce the number of additional font subsets to embed.
1 parent 50f76ff commit 8274e17

File tree

3 files changed

+116
-38
lines changed

3 files changed

+116
-38
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,58 @@ def font_as_file(font):
103103
return fh
104104

105105

106+
class GlyphMap:
107+
"""
108+
A two-way glyph mapping.
109+
110+
The forward glyph map is from (character code, glyph index)-pairs to (subset index,
111+
subset character code)-pairs.
112+
113+
The inverse glyph map is from (subset index, subset character code)-pairs to
114+
(character code, glyph index)-pairs.
115+
"""
116+
117+
def __init__(self) -> None:
118+
self._forward: dict[tuple[CharacterCodeType, GlyphIndexType],
119+
tuple[int, CharacterCodeType]] = {}
120+
self._inverse: dict[tuple[int, CharacterCodeType],
121+
tuple[CharacterCodeType, GlyphIndexType]] = {}
122+
123+
def get(self, charcode: CharacterCodeType,
124+
glyph_index: GlyphIndexType) -> tuple[int, CharacterCodeType] | None:
125+
"""
126+
Get the forward mapping from a (character code, glyph index)-pair.
127+
128+
This may return *None* if the pair is not currently mapped.
129+
"""
130+
return self._forward.get((charcode, glyph_index))
131+
132+
def iget(self, subset: int,
133+
subset_charcode: CharacterCodeType) -> tuple[CharacterCodeType,
134+
GlyphIndexType]:
135+
"""Get the inverse mapping from a (subset, subset charcode)-pair."""
136+
return self._inverse[(subset, subset_charcode)]
137+
138+
def add(self, charcode: CharacterCodeType, glyph_index: GlyphIndexType, subset: int,
139+
subset_charcode: CharacterCodeType) -> None:
140+
"""
141+
Add a mapping to this instance.
142+
143+
Parameters
144+
----------
145+
charcode : CharacterCodeType
146+
The character code to record.
147+
glyph_index : GlyphIndexType
148+
The corresponding glyph index to record.
149+
subset : int
150+
The subset in which the subset character code resides.
151+
subset_charcode : CharacterCodeType
152+
The subset character code within the above subset.
153+
"""
154+
self._forward[(charcode, glyph_index)] = (subset, subset_charcode)
155+
self._inverse[(subset, subset_charcode)] = (charcode, glyph_index)
156+
157+
106158
class CharacterTracker:
107159
"""
108160
Helper for font subsetting by the PDF and PS backends.
@@ -114,16 +166,20 @@ class CharacterTracker:
114166
----------
115167
subset_size : int
116168
The size at which characters are grouped into subsets.
117-
used : dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]]
169+
used : dict
118170
A dictionary of font files to character maps.
119171
120-
The key is a font filename and subset within that font.
172+
The key is a font filename.
121173
122-
The value is a dictionary mapping a character code to a glyph index. Note this
123-
mapping is the inverse of FreeType, which maps glyph indices to character codes.
174+
The value is a list of dictionaries, each mapping at most *subset_size*
175+
character codes to glyph indices. Note this mapping is the inverse of FreeType,
176+
which maps glyph indices to character codes.
124177
125178
If *subset_size* is not set, then there will only be one subset per font
126179
filename.
180+
glyph_maps : dict
181+
A dictionary of font files to glyph maps. You will probably want to use the
182+
`.subset_to_unicode` method instead of this attribute.
127183
"""
128184

129185
def __init__(self, subset_size: int = 0):
@@ -134,7 +190,8 @@ def __init__(self, subset_size: int = 0):
134190
The maximum size that is supported for an embedded font. If provided, then
135191
characters will be grouped into these sized subsets.
136192
"""
137-
self.used: dict[tuple[str, int], dict[CharacterCodeType, GlyphIndexType]] = {}
193+
self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
194+
self.glyph_maps: dict[str, GlyphMap] = {}
138195
self.subset_size = subset_size
139196

140197
def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
@@ -186,33 +243,50 @@ def track_glyph(
186243
The character code within the above subset. If *subset_size* was not
187244
specified on this instance, then this is just *charcode* unmodified.
188245
"""
189-
if self.subset_size != 0:
190-
subset = charcode // self.subset_size
191-
subset_charcode = charcode % self.subset_size
246+
glyph_map = self.glyph_maps.setdefault(font.fname, GlyphMap())
247+
if result := glyph_map.get(charcode, glyph):
248+
return result
249+
250+
subset_maps = self.used.setdefault(font.fname, [{}])
251+
# Default to preserving the character code as it was.
252+
use_next_charmap = (
253+
self.subset_size != 0
254+
# But start filling a new subset if outside the first block; this preserves
255+
# ASCII (for Type 3) or the Basic Multilingual Plane (for Type 42).
256+
and charcode >= self.subset_size
257+
)
258+
if use_next_charmap:
259+
if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
260+
subset_maps.append({})
261+
subset = len(subset_maps) - 1
262+
subset_charcode = len(subset_maps[-1])
192263
else:
193264
subset = 0
194265
subset_charcode = charcode
195-
self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
266+
subset_maps[subset][subset_charcode] = glyph
267+
glyph_map.add(charcode, glyph, subset, subset_charcode)
196268
return (subset, subset_charcode)
197269

198-
def subset_to_unicode(self, index: int,
199-
charcode: CharacterCodeType) -> CharacterCodeType:
270+
def subset_to_unicode(self, fontname: str, subset: int,
271+
subset_charcode: CharacterCodeType) -> CharacterCodeType:
200272
"""
201273
Map a subset index and character code to a Unicode character code.
202274
203275
Parameters
204276
----------
205-
index : int
277+
fontname : str
278+
The name of the font, from the *used* dictionary key.
279+
subset : int
206280
The subset index within a font.
207-
charcode : CharacterCodeType
281+
subset_charcode : CharacterCodeType
208282
The character code within a subset to map back.
209283
210284
Returns
211285
-------
212286
CharacterCodeType
213287
The Unicode character code corresponding to the subsetted one.
214288
"""
215-
return index * self.subset_size + charcode
289+
return self.glyph_maps[fontname].iget(subset, subset_charcode)[0]
216290

217291

218292
class RendererPDFPSBase(RendererBase):

lib/matplotlib/backends/backend_pdf.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -948,9 +948,8 @@ def writeFonts(self):
948948
else:
949949
# a normal TrueType font
950950
_log.debug('Writing TrueType font.')
951-
charmap = self._character_tracker.used.get((filename, subset))
952-
if charmap:
953-
fonts[Fx] = self.embedTTF(filename, subset, charmap)
951+
charmap = self._character_tracker.used[filename][subset]
952+
fonts[Fx] = self.embedTTF(filename, subset, charmap)
954953
self.writeObject(self.fontObject, fonts)
955954

956955
def _write_afm_font(self, filename):
@@ -992,8 +991,11 @@ def _embedTeXFont(self, dvifont):
992991

993992
# Reduce the font to only the glyphs used in the document, get the encoding
994993
# for that subset, and compute various properties based on the encoding.
995-
charmap = self._character_tracker.used[(dvifont.fname, 0)]
996-
chars = frozenset(charmap.keys())
994+
charmap = self._character_tracker.used[dvifont.fname][0]
995+
chars = {
996+
self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
997+
for ccode in charmap
998+
}
997999
t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
9981000
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
9991001
# createType1Descriptor writes the font data as a side effect
@@ -1144,14 +1146,16 @@ def generate_unicode_cmap(subset_index, charmap):
11441146
unicode_groups[-1][1] = ccode
11451147
last_ccode = ccode
11461148

1149+
def _to_unicode(ccode):
1150+
real_ccode = self._character_tracker.subset_to_unicode(
1151+
filename, subset_index, ccode)
1152+
unicodestr = chr(real_ccode).encode('utf-16be').hex()
1153+
return f'<{unicodestr}>'
1154+
11471155
width = 2 if fonttype == 3 else 4
11481156
unicode_bfrange = []
11491157
for start, end in unicode_groups:
1150-
real_start = self._character_tracker.subset_to_unicode(subset_index,
1151-
start)
1152-
real_end = self._character_tracker.subset_to_unicode(subset_index, end)
1153-
real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
1154-
for x in range(real_start, real_end+1))
1158+
real_values = ' '.join(_to_unicode(x) for x in range(start, end+1))
11551159
unicode_bfrange.append(
11561160
f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
11571161
unicode_cmap = (self._identityToUnicodeCMap %

lib/matplotlib/backends/backend_ps.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,24 +1065,24 @@ def print_figure_impl(fh):
10651065
Ndict = len(_psDefs)
10661066
print("%%BeginProlog", file=fh)
10671067
if not mpl.rcParams['ps.useafm']:
1068-
Ndict += len(ps_renderer._character_tracker.used)
1068+
Ndict += sum(map(len, ps_renderer._character_tracker.used.values()))
10691069
print("/mpldict %d dict def" % Ndict, file=fh)
10701070
print("mpldict begin", file=fh)
10711071
print("\n".join(_psDefs), file=fh)
10721072
if not mpl.rcParams['ps.useafm']:
1073-
for (font, subset_index), charmap in \
1074-
ps_renderer._character_tracker.used.items():
1075-
if not charmap:
1076-
continue
1077-
fonttype = mpl.rcParams['ps.fonttype']
1078-
# Can't use more than 255 chars from a single Type 3 font.
1079-
if len(charmap) > 255:
1080-
fonttype = 42
1081-
fh.flush()
1082-
if fonttype == 3:
1083-
fh.write(_font_to_ps_type3(font, charmap.values()))
1084-
else: # Type 42 only.
1085-
_font_to_ps_type42(font, charmap.values(), fh)
1073+
for font, subsets in ps_renderer._character_tracker.used.items():
1074+
for charmap in subsets:
1075+
if not charmap:
1076+
continue
1077+
fonttype = mpl.rcParams['ps.fonttype']
1078+
# Can't use more than 255 chars from a single Type 3 font.
1079+
if len(charmap) > 255:
1080+
fonttype = 42
1081+
fh.flush()
1082+
if fonttype == 3:
1083+
fh.write(_font_to_ps_type3(font, charmap.values()))
1084+
else: # Type 42 only.
1085+
_font_to_ps_type42(font, charmap.values(), fh)
10861086
print("end", file=fh)
10871087
print("%%EndProlog", file=fh)
10881088

0 commit comments

Comments
 (0)