Skip to content

Commit ef7a4e2

Browse files
committed
pdf: Correct Unicode mapping for out-of-range font chunks
For Type 3 fonts, add a `ToUnicode` mapping (a feature introduced in PDF 1.2). For Type 42 fonts, correct the encoding of the Unicode values in the mapping, which should be UTF-16BE, not UCS-2.
1 parent: 761cbaf; commit: ef7a4e2

File tree

2 files changed

+64
-31
lines changed

2 files changed

+64
-31
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,25 @@ def track_glyph(
205205
self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
206206
return (subset, subset_charcode)
207207

208+
def subset_to_unicode(self, index: int,
209+
charcode: CharacterCodeType) -> CharacterCodeType:
210+
"""
211+
Map a subset index and character code to a Unicode character code.
212+
213+
Parameters
214+
----------
215+
index : int
216+
The subset index within a font.
217+
charcode : CharacterCodeType
218+
The character code within a subset to map back.
219+
220+
Returns
221+
-------
222+
CharacterCodeType
223+
The Unicode character code corresponding to the subsetted one.
224+
"""
225+
return index * self.subset_size + charcode
226+
208227

209228
class RendererPDFPSBase(RendererBase):
210229
# The following attributes must be defined by the subclasses:

lib/matplotlib/backends/backend_pdf.py

Lines changed: 45 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -950,7 +950,7 @@ def writeFonts(self):
950950
_log.debug('Writing TrueType font.')
951951
charmap = self._character_tracker.used.get((filename, subset))
952952
if charmap:
953-
fonts[Fx] = self.embedTTF(filename, charmap)
953+
fonts[Fx] = self.embedTTF(filename, subset, charmap)
954954
self.writeObject(self.fontObject, fonts)
955955

956956
def _write_afm_font(self, filename):
@@ -1117,7 +1117,7 @@ def createType1Descriptor(self, t1font, fontfile=None):
11171117
end
11181118
end"""
11191119

1120-
def embedTTF(self, filename, charmap):
1120+
def embedTTF(self, filename, subset_index, charmap):
11211121
"""Embed the TTF font from the named file into the document."""
11221122
font = get_font(filename)
11231123
fonttype = mpl.rcParams['pdf.fonttype']
@@ -1133,12 +1133,40 @@ def cvt(length, upe=font.units_per_EM, nearest=True):
11331133
else:
11341134
return math.ceil(value)
11351135

1136-
def embedTTFType3(font, charmap, descriptor):
1136+
def generate_unicode_cmap(subset_index, charmap):
1137+
# Make the ToUnicode CMap.
1138+
last_ccode = -2
1139+
unicode_groups = []
1140+
for ccode in sorted(charmap.keys()):
1141+
if ccode != last_ccode + 1:
1142+
unicode_groups.append([ccode, ccode])
1143+
else:
1144+
unicode_groups[-1][1] = ccode
1145+
last_ccode = ccode
1146+
1147+
width = 2 if fonttype == 3 else 4
1148+
unicode_bfrange = []
1149+
for start, end in unicode_groups:
1150+
real_start = self._character_tracker.subset_to_unicode(subset_index,
1151+
start)
1152+
real_end = self._character_tracker.subset_to_unicode(subset_index, end)
1153+
real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
1154+
for x in range(real_start, real_end+1))
1155+
unicode_bfrange.append(
1156+
f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
1157+
unicode_cmap = (self._identityToUnicodeCMap %
1158+
(len(unicode_groups),
1159+
'\n'.join(unicode_bfrange).encode('ascii')))
1160+
1161+
return unicode_cmap
1162+
1163+
def embedTTFType3(font, subset_index, charmap, descriptor):
11371164
"""The Type 3-specific part of embedding a Truetype font"""
11381165
widthsObject = self.reserveObject('font widths')
11391166
fontdescObject = self.reserveObject('font descriptor')
11401167
fontdictObject = self.reserveObject('font dictionary')
11411168
charprocsObject = self.reserveObject('character procs')
1169+
toUnicodeMapObject = self.reserveObject('ToUnicode map')
11421170
differencesArray = []
11431171
firstchar, lastchar = min(charmap), max(charmap)
11441172
bbox = [cvt(x, nearest=False) for x in font.bbox]
@@ -1157,8 +1185,9 @@ def embedTTFType3(font, charmap, descriptor):
11571185
'Encoding': {
11581186
'Type': Name('Encoding'),
11591187
'Differences': differencesArray},
1160-
'Widths': widthsObject
1161-
}
1188+
'Widths': widthsObject,
1189+
'ToUnicode': toUnicodeMapObject,
1190+
}
11621191

11631192
# Make the "Widths" array
11641193
def get_char_width(charcode):
@@ -1191,15 +1220,18 @@ def get_char_width(charcode):
11911220
self.outputStream(charprocObject, stream)
11921221
charprocs[charname] = charprocObject
11931222

1223+
unicode_cmap = generate_unicode_cmap(subset_index, charmap)
1224+
11941225
# Write everything out
11951226
self.writeObject(fontdictObject, fontdict)
11961227
self.writeObject(fontdescObject, descriptor)
11971228
self.writeObject(widthsObject, widths)
11981229
self.writeObject(charprocsObject, charprocs)
1230+
self.outputStream(toUnicodeMapObject, unicode_cmap)
11991231

12001232
return fontdictObject
12011233

1202-
def embedTTFType42(font, charmap, descriptor):
1234+
def embedTTFType42(font, subset_index, charmap, descriptor):
12031235
"""The Type 42-specific part of embedding a Truetype font"""
12041236
fontdescObject = self.reserveObject('font descriptor')
12051237
cidFontDictObject = self.reserveObject('CID font dictionary')
@@ -1209,12 +1241,12 @@ def embedTTFType42(font, charmap, descriptor):
12091241
wObject = self.reserveObject('Type 0 widths')
12101242
toUnicodeMapObject = self.reserveObject('ToUnicode map')
12111243

1212-
_log.debug("SUBSET %s characters: %s", filename, charmap)
1244+
_log.debug("SUBSET %s:%d characters: %s", filename, subset_index, charmap)
12131245
with _backend_pdf_ps.get_glyphs_subset(filename,
12141246
charmap.values()) as subset:
12151247
fontdata = _backend_pdf_ps.font_as_file(subset)
12161248
_log.debug(
1217-
"SUBSET %s %d -> %d", filename,
1249+
"SUBSET %s:%d %d -> %d", filename, subset_index,
12181250
os.stat(filename).st_size, fontdata.getbuffer().nbytes
12191251
)
12201252

@@ -1251,55 +1283,37 @@ def embedTTFType42(font, charmap, descriptor):
12511283
fontfileObject, fontdata.getvalue(),
12521284
extra={'Length1': fontdata.getbuffer().nbytes})
12531285

1254-
# Make the 'W' (Widths) array, CidToGidMap and ToUnicode CMap
1255-
# at the same time
1286+
# Make the 'W' (Widths) array and CidToGidMap at the same time.
12561287
cid_to_gid_map = ['\0'] * 65536
12571288
widths = []
12581289
max_ccode = 0
12591290
for ccode, gind in charmap.items():
12601291
glyph = font.load_glyph(gind,
12611292
flags=LoadFlags.NO_SCALE | LoadFlags.NO_HINTING)
12621293
widths.append((ccode, cvt(glyph.horiAdvance)))
1263-
if ccode < 65536:
1264-
cid_to_gid_map[ccode] = chr(gind)
1294+
cid_to_gid_map[ccode] = chr(gind)
12651295
max_ccode = max(ccode, max_ccode)
12661296
widths.sort()
12671297
cid_to_gid_map = cid_to_gid_map[:max_ccode + 1]
12681298

12691299
last_ccode = -2
12701300
w = []
12711301
max_width = 0
1272-
unicode_groups = []
12731302
for ccode, width in widths:
12741303
if ccode != last_ccode + 1:
12751304
w.append(ccode)
12761305
w.append([width])
1277-
unicode_groups.append([ccode, ccode])
12781306
else:
12791307
w[-1].append(width)
1280-
unicode_groups[-1][1] = ccode
12811308
max_width = max(max_width, width)
12821309
last_ccode = ccode
12831310

1284-
unicode_bfrange = []
1285-
for start, end in unicode_groups:
1286-
# Ensure the CID map contains only chars from BMP
1287-
if start > 65535:
1288-
continue
1289-
end = min(65535, end)
1290-
1291-
unicode_bfrange.append(
1292-
b"<%04x> <%04x> [%s]" %
1293-
(start, end,
1294-
b" ".join(b"<%04x>" % x for x in range(start, end+1))))
1295-
unicode_cmap = (self._identityToUnicodeCMap %
1296-
(len(unicode_groups), b"\n".join(unicode_bfrange)))
1297-
12981311
# CIDToGIDMap stream
12991312
cid_to_gid_map = "".join(cid_to_gid_map).encode("utf-16be")
13001313
self.outputStream(cidToGidMapObject, cid_to_gid_map)
13011314

13021315
# ToUnicode CMap
1316+
unicode_cmap = generate_unicode_cmap(subset_index, charmap)
13031317
self.outputStream(toUnicodeMapObject, unicode_cmap)
13041318

13051319
descriptor['MaxWidth'] = max_width
@@ -1355,9 +1369,9 @@ def embedTTFType42(font, charmap, descriptor):
13551369
}
13561370

13571371
if fonttype == 3:
1358-
return embedTTFType3(font, charmap, descriptor)
1372+
return embedTTFType3(font, subset_index, charmap, descriptor)
13591373
elif fonttype == 42:
1360-
return embedTTFType42(font, charmap, descriptor)
1374+
return embedTTFType42(font, subset_index, charmap, descriptor)
13611375

13621376
def alphaState(self, alpha):
13631377
"""Return name of an ExtGState that sets alpha to the given value."""

0 commit comments

Comments
 (0)