Skip to content

Commit 2d5883f

Browse files
committed
Speedup pdftex.map parsing.
As a reminder, pdftex.map is a file that maps TeX font names ("cmr10") to filesystem font names ("cmr10.pfb"), together with additional metadata (font encoding, PostScript special commands). When using pdf output with usetex, we parse the usetex-generated dvi files and then need to locate and load these fonts for embedding into the pdf file, and hence need to parse pdftex.map. On some systems (likely those with large texlive installs), pdftex.map can be really large (>10^4 entries), and parsing it is quite slow (>500ms on the matplotlib macOS setup). This patch implements a new (simpler?) parser, which is ~25% faster (so it can cut hundreds of ms on systems with large maps). The patch additionally handles correctly entries of the form `foo <bar.pfb` (i.e., with no PostScript font name -- in that case the docs say that the PostScript font name is the same as the tfm name). On the other hand, the patch also drops support for quotes around anything but the PostScript specials (in accordance with the psfonts.map docs and with the actual pdftex implementation in `src/texk/web2c/pdftexdir/mapfile.c`, where `case '"': /* opening quote */` only handles PostScript specials). See also the changes to test.map for the changes in supported syntax.
1 parent b681abc commit 2d5883f

File tree

3 files changed

+70
-77
lines changed

3 files changed

+70
-77
lines changed

lib/matplotlib/dviread.py

Lines changed: 62 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -880,84 +880,74 @@ def _parse(self, file):
880880
"""
881881
Parse the font mapping file.
882882
883-
The format is, AFAIK: texname fontname [effects and filenames]
884-
Effects are PostScript snippets like ".177 SlantFont",
885-
filenames begin with one or two less-than signs. A filename
886-
ending in enc is an encoding file, other filenames are font
887-
files. This can be overridden with a left bracket: <[foobar
888-
indicates an encoding file named foobar.
889-
890-
There is some difference between <foo.pfb and <<bar.pfb in
891-
subsetting, but I have no example of << in my TeX installation.
883+
The format is (partially) documented at
884+
http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
885+
https://tug.org/texinfohtml/dvips.html#psfonts_002emap
886+
Each line can have the following fields:
887+
888+
- tfmname (first, only required field),
889+
- psname (defaults to tfmname, must come immediately after tfmname if
890+
present),
891+
- fontflags (integer, must come immediately after psname if present,
892+
ignored by us),
893+
- special (SlantFont and ExtendFont, only field that is double-quoted),
894+
- fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
895+
precedes a font, <[ always precedes an encoding, < can precede either
896+
but then an encoding file must have extension .enc; < and << also
897+
request different font subsetting behaviors but we ignore that; < can
898+
be separated from the filename by whitespace).
899+
900+
special, fontfile, and encodingfile can appear in any order.
892901
"""
893902
# If the map file specifies multiple encodings for a font, we
894903
# follow pdfTeX in choosing the last one specified. Such
895904
# entries are probably mistakes but they have occurred.
896905
# http://tex.stackexchange.com/questions/10826/
897-
# http://article.gmane.org/gmane.comp.tex.pdftex/4914
898-
899-
empty_re = re.compile(br'%|\s*$')
900-
word_re = re.compile(
901-
br'''(?x) (?:
902-
"<\[ (?P<enc1> [^"]+ )" | # quoted encoding marked by [
903-
"< (?P<enc2> [^"]+.enc)" | # quoted encoding, ends in .enc
904-
"<<? (?P<file1> [^"]+ )" | # quoted font file name
905-
" (?P<eff1> [^"]+ )" | # quoted effects or font name
906-
<\[ (?P<enc3> \S+ ) | # encoding marked by [
907-
< (?P<enc4> \S+ .enc) | # encoding, ends in .enc
908-
<<? (?P<file2> \S+ ) | # font file name
909-
(?P<eff2> \S+ ) # effects or font name
910-
)''')
911-
effects_re = re.compile(
912-
br'''(?x) (?P<slant> -?[0-9]*(?:\.[0-9]+)) \s* SlantFont
913-
| (?P<extend>-?[0-9]*(?:\.[0-9]+)) \s* ExtendFont''')
914-
915-
lines = (line.strip()
916-
for line in file
917-
if not empty_re.match(line))
918-
for line in lines:
919-
effects, encoding, filename = b'', None, None
920-
words = word_re.finditer(line)
921-
922-
# The named groups are mutually exclusive and are
923-
# referenced below at an estimated order of probability of
924-
# occurrence based on looking at my copy of pdftex.map.
925-
# The font names are probably unquoted:
926-
w = next(words)
927-
texname = w.group('eff2') or w.group('eff1')
928-
w = next(words)
929-
psname = w.group('eff2') or w.group('eff1')
930-
931-
for w in words:
932-
# Any effects are almost always quoted:
933-
eff = w.group('eff1') or w.group('eff2')
934-
if eff:
935-
effects = eff
936-
continue
937-
# Encoding files usually have the .enc suffix
938-
# and almost never need quoting:
939-
enc = (w.group('enc4') or w.group('enc3') or
940-
w.group('enc2') or w.group('enc1'))
941-
if enc:
942-
if encoding is not None:
943-
_log.debug('Multiple encodings for %s = %s',
944-
texname, psname)
945-
encoding = enc
946-
continue
947-
# File names are probably unquoted:
948-
filename = w.group('file2') or w.group('file1')
949-
950-
effects_dict = {}
951-
for match in effects_re.finditer(effects):
952-
slant = match.group('slant')
953-
if slant:
954-
effects_dict['slant'] = float(slant)
955-
else:
956-
effects_dict['extend'] = float(match.group('extend'))
957906

958-
self._font[texname] = PsFont(
959-
texname=texname, psname=psname, effects=effects_dict,
960-
encoding=encoding, filename=filename)
907+
word_re = re.compile(br'"([^"]*)(?:"|$)|(\S+)')
908+
for line in file:
909+
if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
910+
continue
911+
tfmname = basename = special = encodingfile = fontfile = None
912+
matches = word_re.finditer(line)
913+
for match in matches:
914+
quoted, unquoted = match.groups()
915+
if unquoted:
916+
if unquoted.startswith(b"<<"): # font
917+
fontfile = unquoted[2:]
918+
elif unquoted.startswith(b"<["): # encoding
919+
encodingfile = unquoted[2:]
920+
elif unquoted.startswith(b"<"): # font or encoding
921+
if unquoted == b"<":
922+
word = next(filter(None, next(matches).groups()))
923+
if word.endswith(b".enc"):
924+
encodingfile = word
925+
else:
926+
fontfile = word
927+
else:
928+
if unquoted.endswith(b".enc"):
929+
encodingfile = unquoted[1:]
930+
else:
931+
fontfile = unquoted[1:]
932+
elif tfmname is None:
933+
tfmname = unquoted
934+
elif basename is None:
935+
basename = unquoted
936+
elif quoted:
937+
special = quoted
938+
if basename is None:
939+
basename = tfmname
940+
effects = {}
941+
if special:
942+
words = reversed(special.split())
943+
for word in words:
944+
if word == b"SlantFont":
945+
effects["slant"] = float(next(words))
946+
elif word == b"ExtendFont":
947+
effects["extend"] = float(next(words))
948+
self._font[tfmname] = PsFont(
949+
texname=tfmname, psname=basename, effects=effects,
950+
encoding=encodingfile, filename=fontfile)
961951

962952

963953
@_api.deprecated("3.3")
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
% used by test_dviread.py
2-
TeXfont1 PSfont1 <font1.pfb "<font1.enc"
3-
TeXfont2 PSfont2 <font2.enc "<font2.pfa"
4-
"TeXfont3" PSfont3 "1.23 UnknownEffect" <[enc3.foo <font3.pfa
2+
TeXfont1 PSfont1 <font1.pfb <font1.enc
3+
TeXfont2 PSfont2 <font2.enc <font2.pfa
4+
TeXfont3 PSfont3 "1.23 UnknownEffect" <[enc3.foo <font3.pfa
55
TeXfont4 PSfont4 "-0.1 SlantFont 2.2 ExtendFont" <font4.enc <font4.pfa
6-
TeXfont5 "PSfont5" <encoding1.enc <encoding2.enc <font5.pfb
6+
TeXfont5 PSfont5 <encoding1.enc <encoding2.enc <font5.pfb
77
TeXfont6 PSfont6
88
TeXfont7 PSfont7 <font7.enc
99
TeXfont8 PSfont8 <font8.pfb
10-
TeXfont9 PSfont9 </absolute/font9.pfb
10+
TeXfont9 </absolute/font9.pfb

lib/matplotlib/tests/test_dviread.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ def test_PsfontsMap(monkeypatch):
4242
assert entry.filename == b'font8.pfb'
4343
assert entry.encoding is None
4444
entry = fontmap[b'TeXfont9']
45+
assert entry.psname == b'TeXfont9'
4546
assert entry.filename == b'/absolute/font9.pfb'
4647
# Missing font
4748
with pytest.raises(KeyError, match='no-such-font'):
4849
fontmap[b'no-such-font']
50+
with pytest.raises(KeyError, match='%'):
51+
fontmap[b'%']
4952

5053

5154
@pytest.mark.skipif(shutil.which("kpsewhich") is None,

0 commit comments

Comments
 (0)