Skip to content

Commit 662ac58

Browse files
committed
pdf: Support multi-character glyphs when subsetting
For ligatures or complex shapings, multiple characters may map to a single glyph. In this case, we still want to output a single character code for the string using the font subset, but the `ToUnicode` map should give back all the characters.
1 parent 11f8d5d commit 662ac58

File tree

2 files changed

+34
-25
lines changed

2 files changed

+34
-25
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,11 @@ class GlyphMap:
107107
"""
108108
A two-way glyph mapping.
109109
110-
The forward glyph map is from (character code, glyph index)-pairs to (subset index,
111-
subset character code)-pairs.
110+
The forward glyph map is from (character string, glyph index)-pairs to
111+
(subset index, subset character code)-pairs.
112112
113113
The inverse glyph map is from (subset index, subset character code)-pairs to
114-
(character code, glyph index)-pairs.
114+
(character string, glyph index)-pairs.
115115
"""
116116

117117
def __init__(self) -> None:
@@ -120,22 +120,21 @@ def __init__(self) -> None:
120120
self._inverse: dict[tuple[int, CharacterCodeType],
121121
tuple[CharacterCodeType, GlyphIndexType]] = {}
122122

123-
def get(self, charcode: CharacterCodeType,
123+
def get(self, charcodes: str,
124124
glyph_index: GlyphIndexType) -> tuple[int, CharacterCodeType] | None:
125125
"""
126-
Get the forward mapping from a (character code, glyph index)-pair.
126+
Get the forward mapping from a (character string, glyph index)-pair.
127127
128128
This may return *None* if the pair is not currently mapped.
129129
"""
130-
return self._forward.get((charcode, glyph_index))
130+
return self._forward.get((charcodes, glyph_index))
131131

132132
def iget(self, subset: int,
133-
subset_charcode: CharacterCodeType) -> tuple[CharacterCodeType,
134-
GlyphIndexType]:
133+
subset_charcode: CharacterCodeType) -> tuple[str, GlyphIndexType]:
135134
"""Get the inverse mapping from a (subset, subset charcode)-pair."""
136135
return self._inverse[(subset, subset_charcode)]
137136

138-
def add(self, charcode: CharacterCodeType, glyph_index: GlyphIndexType, subset: int,
137+
def add(self, charcode: str, glyph_index: GlyphIndexType, subset: int,
139138
subset_charcode: CharacterCodeType) -> None:
140139
"""
141140
Add a mapping to this instance.
@@ -219,18 +218,19 @@ def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
219218
for c, f in font._get_fontmap(s).items()
220219
]
221220

222-
def track_glyph(
223-
self, font: FT2Font, charcode: CharacterCodeType,
224-
glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
221+
def track_glyph(self, font: FT2Font, chars: str | CharacterCodeType,
222+
glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
225223
"""
226224
Record character(s) *chars* at glyph index *glyph* as using font *font*.
227225
228226
Parameters
229227
----------
230228
font : FT2Font
231229
A font that is being used for the provided string.
232-
charcode : CharacterCodeType
233-
The character code to record.
230+
chars : str or CharacterCodeType
231+
The character(s) to record. This may be a single character code, or multiple
232+
characters in a string, if the glyph maps to several characters. It will be
233+
normalized to a string internally.
234234
glyph : GlyphIndexType
235235
The corresponding glyph index to record.
236236
@@ -243,13 +243,21 @@ def track_glyph(
243243
The character code within the above subset. If *subset_size* was not
244244
specified on this instance, then this is just *charcode* unmodified.
245245
"""
246+
if isinstance(chars, str):
247+
charcode = ord(chars[0])
248+
else:
249+
charcode = chars
250+
chars = chr(chars)
251+
246252
glyph_map = self.glyph_maps.setdefault(font.fname, GlyphMap())
247-
if result := glyph_map.get(charcode, glyph):
253+
if result := glyph_map.get(chars, glyph):
248254
return result
249255

250256
subset_maps = self.used.setdefault(font.fname, [{}])
251-
# Default to preserving the character code as it was.
252257
use_next_charmap = (
258+
# Multi-character glyphs always go in the non-0 subset.
259+
len(chars) > 1 or
260+
# Default to preserving the character code as it was.
253261
self.subset_size != 0
254262
and (
255263
# But start filling a new subset if outside the first block; this
@@ -270,11 +278,11 @@ def track_glyph(
270278
subset = 0
271279
subset_charcode = charcode
272280
subset_maps[subset][subset_charcode] = glyph
273-
glyph_map.add(charcode, glyph, subset, subset_charcode)
281+
glyph_map.add(chars, glyph, subset, subset_charcode)
274282
return (subset, subset_charcode)
275283

276284
def subset_to_unicode(self, fontname: str, subset: int,
277-
subset_charcode: CharacterCodeType) -> CharacterCodeType:
285+
subset_charcode: CharacterCodeType) -> str:
278286
"""
279287
Map a subset index and character code to a Unicode character code.
280288
@@ -289,8 +297,8 @@ def subset_to_unicode(self, fontname: str, subset: int,
289297
290298
Returns
291299
-------
292-
CharacterCodeType
293-
The Unicode character code corresponding to the subsetted one.
300+
str
301+
The Unicode character(s) corresponding to the subsetted character code.
294302
"""
295303
return self.glyph_maps[fontname].iget(subset, subset_charcode)[0]
296304

lib/matplotlib/backends/backend_pdf.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -996,7 +996,8 @@ def _embedTeXFont(self, dvifont):
996996
# for that subset, and compute various properties based on the encoding.
997997
charmap = self._character_tracker.used[dvifont.fname][0]
998998
chars = {
999-
self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
999+
# DVI fonts always map a single glyph to a single character.
1000+
ord(self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode))
10001001
for ccode in charmap
10011002
}
10021003
t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
@@ -1150,10 +1151,10 @@ def generate_unicode_cmap(subset_index, charmap):
11501151
last_ccode = ccode
11511152

11521153
def _to_unicode(ccode):
1153-
real_ccode = self._character_tracker.subset_to_unicode(
1154+
chars = self._character_tracker.subset_to_unicode(
11541155
filename, subset_index, ccode)
1155-
unicodestr = chr(real_ccode).encode('utf-16be').hex()
1156-
return f'<{unicodestr}>'
1156+
hexstr = chars.encode('utf-16be').hex()
1157+
return f'<{hexstr}>'
11571158

11581159
width = 2 if fonttype == 3 else 4
11591160
unicode_bfrange = []
@@ -2332,7 +2333,7 @@ def output_singlebyte_chunk(kerns_or_chars):
23322333
for item in _text_helpers.layout(s, font, kern_mode=Kerning.UNFITTED,
23332334
language=language):
23342335
subset, charcode = self.file._character_tracker.track_glyph(
2335-
item.ft_object, ord(item.char), item.glyph_index)
2336+
item.ft_object, item.char, item.glyph_index)
23362337
if (item.ft_object, subset) != prev_font:
23372338
if singlebyte_chunk:
23382339
output_singlebyte_chunk(singlebyte_chunk)

0 commit comments

Comments
 (0)