Skip to content

Commit 5384aa4

Browse files
committed
Comment and (type) annotate "prescanning" module.
Reduce the use of "self" attributes for local temporary variables. Make private functions private.
1 parent 3386823 commit 5384aa4

File tree

2 files changed

+140
-78
lines changed

2 files changed

+140
-78
lines changed

mathics_scanner/prescanner.py

Lines changed: 139 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,42 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
Module for "prescanning". Right now this just means replacing
4+
character escape sequences.
5+
"""
6+
7+
from typing import Callable
28

39
from mathics_scanner.characters import named_characters
410
from mathics_scanner.errors import ScanError, IncompleteSyntaxError
511

612

713
class Prescanner(object):
814
r"""
9-
Converts:
10-
character codes to characters:
15+
A class for converting escape sequences:
16+
Character codes to characters:
1117
\.7A -> z
18+
\.004a -> J
1219
\:004a -> J
1320
\|01D451 -> \U0001D451
1421
\041 -> !
15-
unicode longnames to characters:
22+
Named Characters to Unicode:
1623
\[Theta] -> \u03B8
17-
escape sequences:
24+
ASCII escape sequence:
1825
\n -> literal \n
1926
20-
Also reports trailing \ characters as incomplete.
21-
22-
PreScanner works by breaking the partitioning code into stubs.
27+
Trailing backslash characters (\) are reported as incomplete.
2328
"""
2429

25-
def __init__(self, feeder):
26-
self.feeder = feeder # returns more code when asked
30+
def __init__(self, feeder: Callable):
31+
# feeder is an object whose feed() method returns the next line of the Mathics input
32+
self.feeder = feeder
2733
self.code = feeder.feed() # input code
2834
self.pos = 0 # current position within code
2935

30-
def feed(self):
36+
def feed(self) -> str:
37+
"""
38+
Return the next line of Mathics input
39+
"""
3140
return self.feeder.feed()
3241

3342
def incomplete(self):
@@ -37,89 +46,142 @@ def incomplete(self):
3746
raise IncompleteSyntaxError()
3847
self.code += line
3948

40-
def scan(self):
41-
# main loop
42-
self.stubs = [] # stubs of code to be joined
43-
self.start = self.pos # start of current stub
49+
def scan(self) -> str:
50+
"""
51+
Replace escape sequences in self.code. The replacement string is returned.
52+
Note: self.code is not rewritten in place, although it may be extended with more input when an escape sequence is incomplete.
53+
"""
54+
55+
line_fragments = (
56+
[]
57+
) # line fragments to be joined before returning from this method.
58+
self.fragment_start = (
59+
self.pos
60+
) # start position of line fragment under consideration
61+
62+
def start_new_fragment(pos: int) -> None:
63+
"""
64+
Update position markers to start a new line fragment at ``pos``.
65+
"""
66+
self.pos = pos
67+
self.fragment_start = pos
68+
69+
def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
70+
"""
71+
See if characters self.pos+start_shift .. self.pos+end_shift
72+
can be converted to an integer in base ``base``.
73+
74+
If so, we append the characters before the escape sequence without the
75+
escaping characters like ``\.`` or ``\:``.
76+
77+
We also append the character for the converted integer to ``line_fragments``, and update
78+
position cursors for a new line fragment.
79+
80+
However, if the conversion fails, error messages are issued and nothing
81+
is updated; a ScanError is raised.
82+
"""
83+
start, end = self.pos + start_shift, self.pos + end_shift
84+
result = None
85+
if end <= len(self.code):
86+
text = self.code[start:end]
87+
try:
88+
result = int(text, base)
89+
except ValueError:
90+
pass # result remains None
91+
if result is None:
92+
last = end - start
93+
if last == 2:
94+
self.feeder.message("Syntax", "sntoct2")
95+
elif last == 3:
96+
self.feeder.message("Syntax", "sntoct1")
97+
elif last == 4:
98+
self.feeder.message("Syntax", "snthex")
99+
else:
100+
raise ValueError()
101+
self.feeder.message(
102+
"Syntax", "sntxb", self.code[self.pos :].rstrip("\n")
103+
)
104+
raise ScanError()
105+
line_fragments.append(self.code[start : self.pos])
106+
line_fragments.append(chr(result))
107+
108+
# Set up a new line fragment for the next time we are called.
109+
start_new_fragment(end)
110+
111+
def try_parse_named_character(start_shift: int):
112+
"""Before calling we have matched "\[". Scan to the closing "]" and
113+
try to match what is found in-between with a known named
114+
character, e.g. "Theta". If we can match this, we store
115+
the unicode character equivalent in ``line_fragments``.
116+
If we can't find a named character, error messages are
117+
issued and we leave ``line_fragments`` untouched.
118+
"""
119+
i = self.pos + start_shift
120+
while True:
121+
if i == len(self.code):
122+
self.incomplete()
123+
if self.code[i] == "]":
124+
break
125+
i += 1
126+
127+
named_character = self.code[self.pos + start_shift : i]
128+
if named_character.isalpha():
129+
char = named_characters.get(named_character)
130+
if char is None:
131+
self.feeder.message("Syntax", "sntufn", named_character)
132+
# stay in same line fragment
133+
else:
134+
line_fragments.append(self.code[self.fragment_start : self.pos])
135+
line_fragments.append(char)
136+
start_new_fragment(i + 1)
137+
138+
# Stay in same line fragment, but advance the cursor position.
139+
self.pos = i + 1
140+
141+
# In the following loop, we look for and replace escape
142+
# sequences. The current character under consideration is at
143+
# self.code[self.pos]. When an escape sequence is found at
144+
# that position, the previous line_fragment is extracted and
145+
# stored in ``line_fragments``. The start-position marker for the
146+
# next line_fragment is set and self.pos is updated.
147+
44148
while self.pos < len(self.code):
45149
if self.code[self.pos] == "\\":
150+
# Look for and handle an escape sequence.
46151
if self.pos + 1 == len(self.code):
47152
self.incomplete()
48153
c = self.code[self.pos + 1]
49154
if c == "|":
50-
self.try_parse_base(2, 8, 16)
155+
try_parse_base(2, 8, 16)
51156
if c == ".":
52-
self.try_parse_base(2, 4, 16)
157+
# See if we have a two-digit hexadecimal number.
158+
try_parse_base(2, 4, 16)
53159
elif c == ":":
54-
self.try_parse_base(2, 6, 16)
160+
# See if we have a four-digit hexadecimal number.
161+
try_parse_base(2, 6, 16)
55162
elif c == "[":
56-
self.try_parse_longname(2)
163+
try_parse_named_character(2)
57164
elif c in "01234567":
58-
self.try_parse_base(1, 4, 8)
165+
# See if we have an octal number.
166+
try_parse_base(1, 4, 8)
59167
elif c == "\n":
60168
if self.pos + 2 == len(self.code):
61169
self.incomplete()
62-
self.stubs.append(self.code[self.start : self.pos])
63-
self.newstub(self.pos + 2)
170+
self.line_fragments.append(
171+
self.code[self.fragment_start : self.pos]
172+
)
173+
start_new_fragment(self.pos + 2)
64174
else:
65-
# Two backslashes in succession indicates a single backslash character,
66-
# rather than an escape sequence which also starts with a backslash.
67-
# Advance the scanning cursor (self.pos) over both backslashes.
175+
# Two backslashes in succession indicate a single backslash character.
176+
# Advance the scanning cursor (self.pos) over both backslashes.
68177
# Also, Python's backslash escape mechanism turns the two backslashes
69178
# into one in length calculations.
70179
self.pos += 2
71180
else:
72181
self.pos += 1
73-
self.stubs.append(self.code[self.start :]) # final stub
74-
# reduce
75-
return "".join(self.stubs)
76-
77-
def newstub(self, pos: int) -> None:
78-
self.pos = pos
79-
self.start = pos
80-
81-
def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> None:
82-
start, end = self.pos + start_shift, self.pos + end_shift
83-
result = None
84-
if end <= len(self.code):
85-
text = self.code[start:end]
86-
try:
87-
result = int(text, base)
88-
except ValueError:
89-
pass # result remains None
90-
if result is None:
91-
last = end - start
92-
if last == 2:
93-
self.feeder.message("Syntax", "sntoct2")
94-
elif last == 3:
95-
self.feeder.message("Syntax", "sntoct1")
96-
elif last == 4:
97-
self.feeder.message("Syntax", "snthex")
98-
else:
99-
raise ValueError()
100-
self.feeder.message("Syntax", "sntxb", self.code[self.pos :].rstrip("\n"))
101-
raise ScanError()
102-
self.stubs.append(self.code[self.start : self.pos])
103-
self.stubs.append(chr(result))
104-
self.newstub(end)
105-
106-
def try_parse_longname(self, start_shift):
107-
i = self.pos + start_shift
108-
while True:
109-
if i == len(self.code):
110-
self.incomplete()
111-
if self.code[i] == "]":
112-
break
113-
i += 1
114-
115-
longname = self.code[self.pos + start_shift : i]
116-
if longname.isalpha():
117-
char = named_characters.get(longname)
118-
if char is None:
119-
self.feeder.message("Syntax", "sntufn", longname)
120-
pass # stay in same stub
121-
else:
122-
self.stubs.append(self.code[self.start : self.pos])
123-
self.stubs.append(char)
124-
self.newstub(i + 1)
125-
self.pos = i + 1 # stay in same stub but skip ahead
182+
183+
# Close out final line fragment
184+
line_fragments.append(self.code[self.fragment_start :])
185+
186+
# produce and return the replacement string.
187+
return "".join(line_fragments)

test/test_prescanner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def equal(self, code, result):
2424
def equal_length(self, code, length):
2525
assert len(self.prescan(code)) == length
2626

27-
def test_longnames(self):
27+
def test_named_characters(self):
2828
self.equal(r"\[Theta]", "\u03B8")
2929
self.equal(r"\[CapitalPi]", "\u03A0")
3030
self.equal(r"\[Fake]", r"\[Fake]")

0 commit comments

Comments
 (0)