11# -*- coding: utf-8 -*-
2+ """
3+ Module for "prescanning". Right now this just means replacing
4+ character escape sequences.
5+ """
6+
7+ from typing import Callable
28
39from mathics_scanner .characters import named_characters
410from mathics_scanner .errors import ScanError , IncompleteSyntaxError
511
612
713class Prescanner (object ):
814 r"""
9- Converts :
10- character codes to characters:
15+ A Class for converting escape sequences :
16+ Character codes to characters:
1117 \.7A -> z
18+ \.004a -> J
1219 \:004a -> J
1320 \|01D451 -> \U0001D451
1421 \041 -> !
15- unicode longnames to characters :
22+ Named Characters to Unicode :
1623 \[Theta] -> \u03B8
17- escape sequences :
24+ ASCII escape sequence :
1825 \n -> literal \n
1926
20- Also reports trailing \ characters as incomplete.
21-
22- PreScanner works by breaking the partitioning code into stubs.
27+ Trailing backslash characters (\) are reported as incomplete input.
2328 """
2429
25- def __init__ (self , feeder ):
26- self .feeder = feeder # returns more code when asked
30+ def __init__ (self , feeder : Callable ):
31+ # feeder is a function that returns the next line of the Mathics input
32+ self .feeder = feeder
2733 self .code = feeder .feed () # input code
2834 self .pos = 0 # current position within code
2935
30- def feed (self ):
36+ def feed (self ) -> str :
37+ """
38+ Return the next line of Mathics input
39+ """
3140 return self .feeder .feed ()
3241
3342 def incomplete (self ):
@@ -37,89 +46,142 @@ def incomplete(self):
3746 raise IncompleteSyntaxError ()
3847 self .code += line
3948
40- def scan (self ):
41- # main loop
42- self .stubs = [] # stubs of code to be joined
43- self .start = self .pos # start of current stub
49+ def scan (self ) -> str :
50+ """
51+ Replace escape sequences in self.code. The replacement string is returned.
52+ Note: self.code is not modified.
53+ """
54+
55+ line_fragments = (
56+ []
57+ ) # line fragments to be joined before returning from this method.
58+ self .fragment_start = (
59+ self .pos
60+ ) # start position of line fragment under consideration
61+
62+ def start_new_fragment (pos : int ) -> None :
63+ """
64+ Update position markers to start a new line fragment at ``pos``.
65+ """
66+ self .pos = pos
67+ self .fragment_start = pos
68+
69+ def try_parse_base (start_shift : int , end_shift : int , base : int ) -> None :
70+ """
71+ See if characters self.pos+start_shift .. self.pos+end shift
72+ can be converted to an integer in base ``base``.
73+
74+ If so, we append, the characters before the escape sequence without the
75+ escaping characters like ``\.`` or ``\:``.
76+
77+ We also append the converted integer to ``line_fragments``, and updated
78+ position cursors for a new line fragment.
79+
80+ However, if the conversion fails, error messages are issued and nothing
81+ is updated
82+ """
83+ start , end = self .pos + start_shift , self .pos + end_shift
84+ result = None
85+ if end <= len (self .code ):
86+ text = self .code [start :end ]
87+ try :
88+ result = int (text , base )
89+ except ValueError :
90+ pass # result remains None
91+ if result is None :
92+ last = end - start
93+ if last == 2 :
94+ self .feeder .message ("Syntax" , "sntoct2" )
95+ elif last == 3 :
96+ self .feeder .message ("Syntax" , "sntoct1" )
97+ elif last == 4 :
98+ self .feeder .message ("Syntax" , "snthex" )
99+ else :
100+ raise ValueError ()
101+ self .feeder .message (
102+ "Syntax" , "sntxb" , self .code [self .pos :].rstrip ("\n " )
103+ )
104+ raise ScanError ()
105+ line_fragments .append (self .code [start : self .pos ])
106+ line_fragments .append (chr (result ))
107+
108+ # Set up a new line fragment for the next time we are called.
109+ start_new_fragment (end )
110+
111+ def try_parse_named_character (start_shift : int ):
112+ """Before calling we have matched "\[". Scan to the remaining "]" and
113+ try to match what is found in-between with a known named
114+ character, e.g. "Theta". If we can match this, we store
115+ the unicode character equivalent in ``line_fragments``.
116+ If we can't find a named character, error messages are
117+ issued and we leave ``line_fragments`` untouched.
118+ """
119+ i = self .pos + start_shift
120+ while True :
121+ if i == len (self .code ):
122+ self .incomplete ()
123+ if self .code [i ] == "]" :
124+ break
125+ i += 1
126+
127+ named_character = self .code [self .pos + start_shift : i ]
128+ if named_character .isalpha ():
129+ char = named_characters .get (named_character )
130+ if char is None :
131+ self .feeder .message ("Syntax" , "sntufn" , named_character )
132+ # stay in same line fragment
133+ else :
134+ line_fragments .append (self .code [self .fragment_start : self .pos ])
135+ line_fragments .append (char )
136+ start_new_fragment (i + 1 )
137+
138+ # Stay in same line fragment, but advance the cursor position.
139+ self .pos = i + 1
140+
141+ # In the following loop, we look for and replace escape
142+ # sequences. The current character under consideration is at
143+ # self.code[self.pos]. When an escape sequence is found at
144+ # that position, the previous line_fragment is extracted and
145+ # stored in ``line_fragments``. The start-position marker for the
146+ # next line_fragment is started and self.pos is updated.
147+
44148 while self .pos < len (self .code ):
45149 if self .code [self .pos ] == "\\ " :
150+ # Look for and handle an escape sequence.
46151 if self .pos + 1 == len (self .code ):
47152 self .incomplete ()
48153 c = self .code [self .pos + 1 ]
49154 if c == "|" :
50- self . try_parse_base (2 , 8 , 16 )
155+ try_parse_base (2 , 8 , 16 )
51156 if c == "." :
52- self .try_parse_base (2 , 4 , 16 )
157+ # See if we have a two-digit hexadecimal number.
158+ try_parse_base (2 , 4 , 16 )
53159 elif c == ":" :
54- self .try_parse_base (2 , 6 , 16 )
160+ # See if we have a four-digit hexadecimal number.
161+ try_parse_base (2 , 6 , 16 )
55162 elif c == "[" :
56- self . try_parse_longname (2 )
163+ try_parse_named_character (2 )
57164 elif c in "01234567" :
58- self .try_parse_base (1 , 4 , 8 )
165+ # See if we have an octal number.
166+ try_parse_base (1 , 4 , 8 )
59167 elif c == "\n " :
60168 if self .pos + 2 == len (self .code ):
61169 self .incomplete ()
62- self .stubs .append (self .code [self .start : self .pos ])
63- self .newstub (self .pos + 2 )
170+ self .line_fragments .append (
171+ self .code [self .fragment_start : self .pos ]
172+ )
173+ start_new_fragment (self .pos + 2 )
64174 else :
65- # Two backslashes in succession indicates a single backslash character,
66- # rather than an escape sequence which also starts with a backslash.
67- # Advance the scanning cursor (self.pos) over both backslashes.
175+ # Two backslashes in succession indicates a single backslash character.
176+ # Advance the scanning cursor (self.pos) over both backslashes.
68177 # Also, Python's backslash escape mechanism turns the two backslashes
69178 # into one in length calculations.
70179 self .pos += 2
71180 else :
72181 self .pos += 1
73- self .stubs .append (self .code [self .start :]) # final stub
74- # reduce
75- return "" .join (self .stubs )
76-
77- def newstub (self , pos : int ) -> None :
78- self .pos = pos
79- self .start = pos
80-
81- def try_parse_base (self , start_shift : int , end_shift : int , base : int ) -> None :
82- start , end = self .pos + start_shift , self .pos + end_shift
83- result = None
84- if end <= len (self .code ):
85- text = self .code [start :end ]
86- try :
87- result = int (text , base )
88- except ValueError :
89- pass # result remains None
90- if result is None :
91- last = end - start
92- if last == 2 :
93- self .feeder .message ("Syntax" , "sntoct2" )
94- elif last == 3 :
95- self .feeder .message ("Syntax" , "sntoct1" )
96- elif last == 4 :
97- self .feeder .message ("Syntax" , "snthex" )
98- else :
99- raise ValueError ()
100- self .feeder .message ("Syntax" , "sntxb" , self .code [self .pos :].rstrip ("\n " ))
101- raise ScanError ()
102- self .stubs .append (self .code [self .start : self .pos ])
103- self .stubs .append (chr (result ))
104- self .newstub (end )
105-
106- def try_parse_longname (self , start_shift ):
107- i = self .pos + start_shift
108- while True :
109- if i == len (self .code ):
110- self .incomplete ()
111- if self .code [i ] == "]" :
112- break
113- i += 1
114-
115- longname = self .code [self .pos + start_shift : i ]
116- if longname .isalpha ():
117- char = named_characters .get (longname )
118- if char is None :
119- self .feeder .message ("Syntax" , "sntufn" , longname )
120- pass # stay in same stub
121- else :
122- self .stubs .append (self .code [self .start : self .pos ])
123- self .stubs .append (char )
124- self .newstub (i + 1 )
125- self .pos = i + 1 # stay in same stub but skip ahead
182+
183+ # Close out final line fragment
184+ line_fragments .append (self .code [self .fragment_start :])
185+
186+ # produce and return the replacement string.
187+ return "" .join (line_fragments )
0 commit comments