11# -*- coding: utf-8 -*-
2+ """
3+ Module for "prescanning". Right now this just means replacing
4+ character escape sequences.
5+ """
6+
7+ from typing import Callable
28
39from mathics_scanner .characters import named_characters
410from mathics_scanner .errors import ScanError , IncompleteSyntaxError
511
612
713class Prescanner (object ):
    r"""
    A class for converting escape sequences:
        Character codes to characters:
            \.7A -> z
            \.4a -> J
            \:004a -> J
            \|01D451 -> \U0001D451
            \041 -> !
        Named characters to Unicode:
            \[Theta] -> \u03B8
        ASCII escape sequence:
            \n -> literal \n

    Trailing backslash characters (\) are reported as incomplete.

    The prescanner works by splitting the code into line fragments at each
    escape sequence; the fragments and the replacement characters are then
    joined to form the result.
    """
2429
25- def __init__ (self , feeder ):
26- self .feeder = feeder # returns more code when asked
30+ def __init__ (self , feeder : Callable ):
31+ # feeder is a function that returns the next line of the Mathics input
32+ self .feeder = feeder
2733 self .code = feeder .feed () # input code
2834 self .pos = 0 # current position within code
2935
30- def feed (self ):
36+ def feed (self ) -> str :
37+ """
38+ Return the next line of Mathics input
39+ """
3140 return self .feeder .feed ()
3241
3342 def incomplete (self ):
@@ -37,89 +46,148 @@ def incomplete(self):
3746 raise IncompleteSyntaxError ()
3847 self .code += line
3948
40- def scan (self ):
41- # main loop
42- self .stubs = [] # stubs of code to be joined
43- self .start = self .pos # start of current stub
49+ def replace_escape_sequences (self ) -> str :
50+ """
51+ Replace escape sequences in self.code. The replacement string is returned.
52+ Note: self.code is not modified.
53+ """
54+
55+ # Line fragments to be joined before returning from this method.
56+ line_fragments = []
57+
58+ # Fragment start position of line fragment under consideration.
59+ self .fragment_start = self .pos
60+
61+ def start_new_fragment (pos : int ) -> None :
62+ """
63+ Update position markers to start a new line fragment at ``pos``.
64+ """
65+ self .pos = pos
66+ self .fragment_start = pos
67+
68+ def try_parse_base (start_shift : int , end_shift : int , base : int ) -> None :
69+ """
70+ See if characters self.pos+start_shift .. self.pos+end shift
71+ can be converted to an integer in base ``base``.
72+
73+ If so, we append the characters before the escape sequence without the
74+ escaping characters like ``\.`` or ``\:``.
75+
76+ We also append the converted integer to ``line_fragments``, and update
77+ position cursors for a new line fragment.
78+
79+ However, if the conversion fails, then error messages are
80+ issued and nothing is updated
81+ """
82+ start , end = self .pos + start_shift , self .pos + end_shift
83+ result = None
84+ if end <= len (self .code ):
85+ text = self .code [start :end ]
86+ try :
87+ result = int (text , base )
88+ except ValueError :
89+ pass # result remains None
90+ if result is None :
91+ last = end - start
92+ if last == 2 :
93+ self .feeder .message ("Syntax" , "sntoct2" )
94+ elif last == 3 :
95+ self .feeder .message ("Syntax" , "sntoct1" )
96+ elif last == 4 :
97+ self .feeder .message ("Syntax" , "snthex" )
98+ else :
99+ raise ValueError ()
100+ self .feeder .message (
101+ "Syntax" , "sntxb" , self .code [self .pos :].rstrip ("\n " )
102+ )
103+ raise ScanError ()
104+
105+ # Add text from prior line fragment as well
106+ # as the escape sequence, a character, from the escape sequence
107+ # that was just matched.
108+ line_fragments .append (self .code [start : self .pos ])
109+ line_fragments .append (chr (result ))
110+
111+ # Set up a new line fragment for the next time we are called.
112+ start_new_fragment (end )
113+
114+ def try_parse_named_character (start_shift : int ):
115+ """Before calling we have matched "\[". Scan to the remaining "]" and
116+ try to match what is found in-between with a known named
117+ character, e.g. "Theta". If we can match this, we store
118+ the unicode character equivalent in ``line_fragments``.
119+ If we can't find a named character, error messages are
120+ issued and we leave ``line_fragments`` untouched.
121+ """
122+ i = self .pos + start_shift
123+ while True :
124+ if i == len (self .code ):
125+ self .incomplete ()
126+ if self .code [i ] == "]" :
127+ break
128+ i += 1
129+
130+ named_character = self .code [self .pos + start_shift : i ]
131+ if named_character .isalpha ():
132+ char = named_characters .get (named_character )
133+ if char is None :
134+ self .feeder .message ("Syntax" , "sntufn" , named_character )
135+ # stay in same line fragment
136+ else :
137+ # Add text from prior line fragment as well
138+ # as the escape sequence, a character, from the escape sequence
139+ # just matched.
140+ line_fragments .append (self .code [self .fragment_start : self .pos ])
141+ line_fragments .append (char )
142+ start_new_fragment (i + 1 )
143+
144+ # Stay in same line fragment, but advance the cursor position.
145+ self .pos = i + 1
146+
147+ # In the following loop, we look for and replace escape
148+ # sequences. The current character under consideration is at
149+ # self.code[self.pos]. When an escape sequence is found at
150+ # that position, the previous line_fragment is extracted and
151+ # stored in ``line_fragments``. The start-position marker for the
152+ # next line_fragment is started and self.pos is updated.
153+
44154 while self .pos < len (self .code ):
45155 if self .code [self .pos ] == "\\ " :
156+ # Look for and handle an escape sequence.
46157 if self .pos + 1 == len (self .code ):
47158 self .incomplete ()
48159 c = self .code [self .pos + 1 ]
49160 if c == "|" :
50- self . try_parse_base (2 , 8 , 16 )
161+ try_parse_base (2 , 8 , 16 )
51162 if c == "." :
52- self .try_parse_base (2 , 4 , 16 )
163+ # See if we have a two-digit hexadecimal number.
164+ try_parse_base (2 , 4 , 16 )
53165 elif c == ":" :
54- self .try_parse_base (2 , 6 , 16 )
166+ # See if we have a four-digit hexadecimal number.
167+ try_parse_base (2 , 6 , 16 )
55168 elif c == "[" :
56- self . try_parse_longname (2 )
169+ try_parse_named_character (2 )
57170 elif c in "01234567" :
58- self .try_parse_base (1 , 4 , 8 )
171+ # See if we have an octal number.
172+ try_parse_base (1 , 4 , 8 )
59173 elif c == "\n " :
60174 if self .pos + 2 == len (self .code ):
61175 self .incomplete ()
62- self .stubs .append (self .code [self .start : self .pos ])
63- self .newstub (self .pos + 2 )
176+ self .line_fragments .append (
177+ self .code [self .fragment_start : self .pos ]
178+ )
179+ start_new_fragment (self .pos + 2 )
64180 else :
65- # Two backslashes in succession indicates a single backslash character,
66- # rather than an escape sequence which also starts with a backslash.
67- # Advance the scanning cursor (self.pos) over both backslashes.
181+ # Two backslashes in succession indicates a single backslash character.
182+ # Advance the scanning cursor (self.pos) over both backslashes.
68183 # Also, Python's backslash escape mechanism turns the two backslashes
69184 # into one in length calculations.
70185 self .pos += 2
71186 else :
72187 self .pos += 1
73- self .stubs .append (self .code [self .start :]) # final stub
74- # reduce
75- return "" .join (self .stubs )
76-
77- def newstub (self , pos : int ) -> None :
78- self .pos = pos
79- self .start = pos
80-
81- def try_parse_base (self , start_shift : int , end_shift : int , base : int ) -> None :
82- start , end = self .pos + start_shift , self .pos + end_shift
83- result = None
84- if end <= len (self .code ):
85- text = self .code [start :end ]
86- try :
87- result = int (text , base )
88- except ValueError :
89- pass # result remains None
90- if result is None :
91- last = end - start
92- if last == 2 :
93- self .feeder .message ("Syntax" , "sntoct2" )
94- elif last == 3 :
95- self .feeder .message ("Syntax" , "sntoct1" )
96- elif last == 4 :
97- self .feeder .message ("Syntax" , "snthex" )
98- else :
99- raise ValueError ()
100- self .feeder .message ("Syntax" , "sntxb" , self .code [self .pos :].rstrip ("\n " ))
101- raise ScanError ()
102- self .stubs .append (self .code [self .start : self .pos ])
103- self .stubs .append (chr (result ))
104- self .newstub (end )
105-
106- def try_parse_longname (self , start_shift ):
107- i = self .pos + start_shift
108- while True :
109- if i == len (self .code ):
110- self .incomplete ()
111- if self .code [i ] == "]" :
112- break
113- i += 1
114-
115- longname = self .code [self .pos + start_shift : i ]
116- if longname .isalpha ():
117- char = named_characters .get (longname )
118- if char is None :
119- self .feeder .message ("Syntax" , "sntufn" , longname )
120- pass # stay in same stub
121- else :
122- self .stubs .append (self .code [self .start : self .pos ])
123- self .stubs .append (char )
124- self .newstub (i + 1 )
125- self .pos = i + 1 # stay in same stub but skip ahead
188+
189+ # Add the final line fragment.
190+ line_fragments .append (self .code [self .fragment_start :])
191+
192+ # produce and return the replacement string.
193+ return "" .join (line_fragments )
0 commit comments