Skip to content

Commit 8db3ab1

Browse files
authored
Merge pull request #50 from Mathics3/comment-prescanner
Comment and (type) annotate "prescanning" module.
2 parents 3386823 + 4562830 commit 8db3ab1

File tree

3 files changed

+246
-161
lines changed

3 files changed

+246
-161
lines changed

mathics_scanner/prescanner.py

Lines changed: 145 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,42 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
Module for "prescanning". Right now this just means replacing
4+
character escape sequences.
5+
"""
6+
7+
from typing import Callable
28

39
from mathics_scanner.characters import named_characters
410
from mathics_scanner.errors import ScanError, IncompleteSyntaxError
511

612

713
class Prescanner(object):
814
r"""
9-
Converts:
10-
character codes to characters:
15+
A Class for converting escape sequences:
16+
Character codes to characters:
1117
\.7A -> z
18+
\.004a -> J
1219
\:004a -> J
1320
\|01D451 -> \U0001D451
1421
\041 -> !
15-
unicode longnames to characters:
22+
Named Characters to Unicode:
1623
\[Theta] -> \u03B8
17-
escape sequences:
24+
ASCII escape sequence:
1825
\n -> literal \n
1926
20-
Also reports trailing \ characters as incomplete.
21-
22-
PreScanner works by breaking the partitioning code into stubs.
27+
Trailing backslash characters (\) are reported as incomplete input.
2328
"""
2429

25-
def __init__(self, feeder):
26-
self.feeder = feeder # returns more code when asked
30+
def __init__(self, feeder: Callable):
31+
# feeder is an object whose feed() method returns the next line of the Mathics input
32+
self.feeder = feeder
2733
self.code = feeder.feed() # input code
2834
self.pos = 0 # current position within code
2935

30-
def feed(self):
36+
def feed(self) -> str:
37+
"""
38+
Return the next line of Mathics input
39+
"""
3140
return self.feeder.feed()
3241

3342
def incomplete(self):
@@ -37,89 +46,148 @@ def incomplete(self):
3746
raise IncompleteSyntaxError()
3847
self.code += line
3948

40-
def scan(self):
41-
# main loop
42-
self.stubs = [] # stubs of code to be joined
43-
self.start = self.pos # start of current stub
49+
def replace_escape_sequences(self) -> str:
50+
"""
51+
Replace escape sequences in self.code. The replacement string is returned.
52+
Note: replacements are not written back to self.code; it is only extended when self.incomplete() fetches more input.
53+
"""
54+
55+
# Line fragments to be joined before returning from this method.
56+
line_fragments = []
57+
58+
# Fragment start position of line fragment under consideration.
59+
self.fragment_start = self.pos
60+
61+
def start_new_fragment(pos: int) -> None:
62+
"""
63+
Update position markers to start a new line fragment at ``pos``.
64+
"""
65+
self.pos = pos
66+
self.fragment_start = pos
67+
68+
def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
69+
"""
70+
See if characters self.pos+start_shift .. self.pos+end_shift
71+
can be converted to an integer in base ``base``.
72+
73+
If so, we append the characters before the escape sequence without the
74+
escaping characters like ``\.`` or ``\:``.
75+
76+
We also append the converted integer to ``line_fragments``, and update
77+
position cursors for a new line fragment.
78+
79+
However, if the conversion fails, then error messages are
80+
issued, an exception (normally ScanError) is raised, and nothing is updated.
81+
"""
82+
start, end = self.pos + start_shift, self.pos + end_shift
83+
result = None
84+
if end <= len(self.code):
85+
text = self.code[start:end]
86+
try:
87+
result = int(text, base)
88+
except ValueError:
89+
pass # result remains None
90+
if result is None:
91+
last = end - start
92+
if last == 2:
93+
self.feeder.message("Syntax", "sntoct2")
94+
elif last == 3:
95+
self.feeder.message("Syntax", "sntoct1")
96+
elif last == 4:
97+
self.feeder.message("Syntax", "snthex")
98+
else:
99+
raise ValueError()
100+
self.feeder.message(
101+
"Syntax", "sntxb", self.code[self.pos :].rstrip("\n")
102+
)
103+
raise ScanError()
104+
105+
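# Append the text of the prior line fragment, then the character that
106+
# the just-matched escape sequence denotes. NOTE(review): the slice below
107+
# starts at ``start`` rather than ``self.fragment_start`` (used in the named-character case) - confirm.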
108+
line_fragments.append(self.code[start : self.pos])
109+
line_fragments.append(chr(result))
110+
111+
# Set up a new line fragment for the next time we are called.
112+
start_new_fragment(end)
113+
114+
def try_parse_named_character(start_shift: int):
115+
"""Before calling we have matched "\[". Scan to the remaining "]" and
116+
try to match what is found in-between with a known named
117+
character, e.g. "Theta". If we can match this, we store
118+
the unicode character equivalent in ``line_fragments``.
119+
If we can't find a named character, error messages are
120+
issued and we leave ``line_fragments`` untouched.
121+
"""
122+
i = self.pos + start_shift
123+
while True:
124+
if i == len(self.code):
125+
self.incomplete()
126+
if self.code[i] == "]":
127+
break
128+
i += 1
129+
130+
named_character = self.code[self.pos + start_shift : i]
131+
if named_character.isalpha():
132+
char = named_characters.get(named_character)
133+
if char is None:
134+
self.feeder.message("Syntax", "sntufn", named_character)
135+
# stay in same line fragment
136+
else:
137+
# Append the text of the prior line fragment, then the
138+
# character that the named character
139+
# just matched maps to.
140+
line_fragments.append(self.code[self.fragment_start : self.pos])
141+
line_fragments.append(char)
142+
start_new_fragment(i + 1)
143+
144+
# Stay in same line fragment, but advance the cursor position.
145+
self.pos = i + 1
146+
147+
# In the following loop, we look for and replace escape
148+
# sequences. The current character under consideration is at
149+
# self.code[self.pos]. When an escape sequence is found at
150+
# that position, the previous line_fragment is extracted and
151+
# stored in ``line_fragments``. The start-position marker for the
152+
# next line fragment is then set and self.pos is updated.
153+
44154
while self.pos < len(self.code):
45155
if self.code[self.pos] == "\\":
156+
# Look for and handle an escape sequence.
46157
if self.pos + 1 == len(self.code):
47158
self.incomplete()
48159
c = self.code[self.pos + 1]
49160
if c == "|":
50-
self.try_parse_base(2, 8, 16)
161+
try_parse_base(2, 8, 16)
51162
if c == ".":
52-
self.try_parse_base(2, 4, 16)
163+
# See if we have a two-digit hexadecimal number.
164+
try_parse_base(2, 4, 16)
53165
elif c == ":":
54-
self.try_parse_base(2, 6, 16)
166+
# See if we have a four-digit hexadecimal number.
167+
try_parse_base(2, 6, 16)
55168
elif c == "[":
56-
self.try_parse_longname(2)
169+
try_parse_named_character(2)
57170
elif c in "01234567":
58-
self.try_parse_base(1, 4, 8)
171+
# See if we have an octal number.
172+
try_parse_base(1, 4, 8)
59173
elif c == "\n":
60174
if self.pos + 2 == len(self.code):
61175
self.incomplete()
62-
self.stubs.append(self.code[self.start : self.pos])
63-
self.newstub(self.pos + 2)
176+
self.line_fragments.append(
177+
self.code[self.fragment_start : self.pos]
178+
)
179+
start_new_fragment(self.pos + 2)
64180
else:
65-
# Two backslashes in succession indicates a single backslash character,
66-
# rather than an escape sequence which also starts with a backslash.
67-
# Advance the scanning cursor (self.pos) over both backslashes.
181+
# Two backslashes in succession indicates a single backslash character.
182+
# Advance the scanning cursor (self.pos) over both backslashes.
68183
# Also, Python's backslash escape mechanism turns the two backslashes
69184
# into one in length calculations.
70185
self.pos += 2
71186
else:
72187
self.pos += 1
73-
self.stubs.append(self.code[self.start :]) # final stub
74-
# reduce
75-
return "".join(self.stubs)
76-
77-
def newstub(self, pos: int) -> None:
78-
self.pos = pos
79-
self.start = pos
80-
81-
def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> None:
82-
start, end = self.pos + start_shift, self.pos + end_shift
83-
result = None
84-
if end <= len(self.code):
85-
text = self.code[start:end]
86-
try:
87-
result = int(text, base)
88-
except ValueError:
89-
pass # result remains None
90-
if result is None:
91-
last = end - start
92-
if last == 2:
93-
self.feeder.message("Syntax", "sntoct2")
94-
elif last == 3:
95-
self.feeder.message("Syntax", "sntoct1")
96-
elif last == 4:
97-
self.feeder.message("Syntax", "snthex")
98-
else:
99-
raise ValueError()
100-
self.feeder.message("Syntax", "sntxb", self.code[self.pos :].rstrip("\n"))
101-
raise ScanError()
102-
self.stubs.append(self.code[self.start : self.pos])
103-
self.stubs.append(chr(result))
104-
self.newstub(end)
105-
106-
def try_parse_longname(self, start_shift):
107-
i = self.pos + start_shift
108-
while True:
109-
if i == len(self.code):
110-
self.incomplete()
111-
if self.code[i] == "]":
112-
break
113-
i += 1
114-
115-
longname = self.code[self.pos + start_shift : i]
116-
if longname.isalpha():
117-
char = named_characters.get(longname)
118-
if char is None:
119-
self.feeder.message("Syntax", "sntufn", longname)
120-
pass # stay in same stub
121-
else:
122-
self.stubs.append(self.code[self.start : self.pos])
123-
self.stubs.append(char)
124-
self.newstub(i + 1)
125-
self.pos = i + 1 # stay in same stub but skip ahead
188+
189+
# Add the final line fragment.
190+
line_fragments.append(self.code[self.fragment_start :])
191+
192+
# produce and return the replacement string.
193+
return "".join(line_fragments)

mathics_scanner/tokeniser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ def __init__(self, feeder):
357357
self.pos = 0
358358
self.feeder = feeder
359359
self.prescanner = Prescanner(feeder)
360-
self.code = self.prescanner.scan()
360+
self.code = self.prescanner.replace_escape_sequences()
361361
self._change_mode("expr")
362362

363363
def _change_mode(self, mode):
@@ -367,11 +367,11 @@ def _change_mode(self, mode):
367367
self.mode = mode
368368
self.tokens, self.token_indices = self.modes[mode]
369369

370-
# TODO: Rename this to something that remotetly makes sense?
370+
# TODO: Rename this to something that remotely makes sense?
371371
def incomplete(self):
372372
"Get more code from the prescanner and continue."
373373
self.prescanner.incomplete()
374-
self.code += self.prescanner.scan()
374+
self.code += self.prescanner.replace_escape_sequences()
375375

376376
def sntx_message(self, pos=None):
377377
"""Send a message to the feeder."""

0 commit comments

Comments
 (0)