11# -*- coding: utf-8 -*-
22
# Symbols made of exactly one character.  Sets are used because these
# collections are only ever queried for membership.
lSingleCharacterSymbols = {
    ",",
    ":",
    "(",
    ")",
    "'",
    '"',
    "+",
    "&",
    "-",
    "*",
    "/",
    "<",
    ">",
    ";",
    "=",
    "[",
    "]",
    "?",
}

# Symbols made of exactly two characters.
lTwoCharacterSymbols = {
    "=>",
    "**",
    ":=",
    "/=",
    ">=",
    "<=",
    "<>",
    "??",
    "?=",
    "?<",
    "?>",
    "<<",
    ">>",
    "--",
    "/*",
    "*/",
}

# Symbols made of exactly three characters.
lThreeCharacterSymbols = {
    "?/=",
    "?<=",
    "?>=",
}

# Every symbol that spans more than one character.
lMultiCharacterSymbols = lTwoCharacterSymbols.union(lThreeCharacterSymbols)

# Characters tested by stop_character_found().
lStopChars = {
    " ",
    "(",
    ";",
}
9+
10+
def build_symbol_prefix_tree(lSymbols):
    """Build a nested-dict prefix tree (trie) from an iterable of symbols.

    Each level of the tree is a dict keyed by the next character of a symbol.
    The node reached after the last character of a symbol additionally holds
    the key "$", whose value is the complete symbol string.

    Parameters:
        lSymbols: iterable of symbol strings.

    Returns:
        The root dict of the prefix tree; an empty dict when there are no
        symbols.
    """
    dRoot = {}
    for sSymbol in lSymbols:
        dBranch = dRoot
        for sChar in sSymbol:
            # Create the child branch on first use, then descend into it.
            if sChar not in dBranch:
                dBranch[sChar] = {}
            dBranch = dBranch[sChar]
        # "$" marks the end of a complete symbol.
        dBranch["$"] = sSymbol
    return dRoot
21+
22+
23+ dSymbolTree = build_symbol_prefix_tree (lMultiCharacterSymbols )
924
1025
1126def create (sString ):
@@ -17,8 +32,7 @@ def create(sString):
1732 oLine .combine_whitespace ()
1833 oLine .combine_string_literals ()
1934 oLine .combine_backslash_characters_into_symbols ()
20- oLine .combine_three_character_symbols ()
21- oLine .combine_two_character_symbols ()
35+ oLine .combine_symbols_with_prefix_tree ()
2236 oLine .combine_characters_into_words ()
2337 oLine .combine_character_literals ()
2438 oLine .split_natural_numbers ()
@@ -28,7 +42,7 @@ def create(sString):
2842
2943class New :
3044 def __init__ (self , sLine ):
31- self .lChars = convert_string_to_chars (sLine )
45+ self .lChars = list (sLine )
3246
3347 def combine_whitespace (self ):
3448 lReturn = []
@@ -46,6 +60,30 @@ def combine_whitespace(self):
4660
4761 self .lChars = lReturn
4862
def combine_symbols_with_prefix_tree(self):
    """Merge runs of tokens that spell a multi-character symbol.

    Walks self.lChars from left to right.  At each position the module-level
    prefix tree dSymbolTree is followed for as long as the upcoming tokens
    keep matching, and the longest complete symbol seen on the way is
    emitted as a single token.  When no symbol starts at the position, the
    token is copied through unchanged.  The result replaces self.lChars.
    """
    # Key under which a tree node stores the complete symbol it terminates.
    sEndMarker = "$"

    lReturn = []
    iStart = 0
    iNumChars = len(self.lChars)
    while iStart < iNumChars:
        dNode = dSymbolTree
        iEnd = iStart
        oLastMatch = None
        iPrevEnd = iStart
        # Try to match as long a symbol as possible.
        while iEnd < iNumChars:
            sChar = self.lChars[iEnd]
            # The end marker lives in the same dict as the character
            # branches but maps to a string, not to a child node.  A literal
            # "$" token must therefore never be followed as a branch;
            # doing so would descend into the symbol string and fail on the
            # next lookup.
            if sChar == sEndMarker or sChar not in dNode:
                break
            dNode = dNode[sChar]
            iEnd += 1
            if sEndMarker in dNode:
                # Remember the longest complete symbol found so far.
                oLastMatch = dNode[sEndMarker]
                iPrevEnd = iEnd
        if oLastMatch is not None:
            lReturn.append(oLastMatch)
            iStart = iPrevEnd
        else:
            lReturn.append(self.lChars[iStart])
            iStart += 1
    self.lChars = lReturn
86+
4987 def combine_backslash_characters_into_symbols (self ):
5088 lReturn = []
5189 sSymbol = ""
@@ -61,48 +99,21 @@ def combine_backslash_characters_into_symbols(self):
6199 lReturn = add_trailing_string (lReturn , sSymbol )
62100 self .lChars = lReturn
63101
64- def combine_three_character_symbols (self ):
65- lReturn = []
66- i = 0
67- while i < len (self .lChars ):
68- sChars = "" .join (self .lChars [i : i + 3 ])
69- if sChars in lThreeCharacterSymbols :
70- lReturn .append (sChars )
71- i += 3
72- else :
73- lReturn .append (self .lChars [i ])
74- i += 1
75-
76- self .lChars = lReturn
77-
78- def combine_two_character_symbols (self ):
79- lReturn = []
80- i = 0
81- while i < len (self .lChars ):
82- sChars = "" .join (self .lChars [i : i + 2 ])
83- if sChars in lTwoCharacterSymbols :
84- lReturn .append (sChars )
85- i += 2
86- else :
87- lReturn .append (self .lChars [i ])
88- i += 1
89-
90- self .lChars = lReturn
91-
def combine_characters_into_words(self):
    """Join consecutive word characters in self.lChars into single tokens.

    Tokens accepted by character_is_part_of_word() are buffered; any other
    token flushes the buffered word (if there is one) and is then copied
    through unchanged.  The result replaces self.lChars.
    """
    lTokens = []
    lWordChars = []

    for sToken in self.lChars:
        if character_is_part_of_word(sToken):
            lWordChars.append(sToken)
            continue
        # A non-word token ends the word collected so far.
        if lWordChars:
            lTokens.append("".join(lWordChars))
            lWordChars.clear()
        lTokens.append(sToken)

    # Flush a word that runs to the end of the line.
    if lWordChars:
        lTokens.append("".join(lWordChars))

    self.lChars = lTokens
108119
@@ -213,31 +224,23 @@ def find_character_literal_candidates(lQuotes, lChars):
213224
def is_character_literal_candidate(iIndex, lQuotes, lChars):
    """Tell whether the quote at lQuotes[iIndex] may open a character literal.

    All three conditions must hold, checked in this order: exactly one token
    lies between this quote and the next one, that token is one character
    long, and that token is not an opening parenthesis.
    """
    iQuote = lQuotes[iIndex]
    if not there_is_a_single_token_between_quotes(iIndex, lQuotes):
        return False
    if not token_between_quotes_is_a_single_character(iQuote, lChars):
        return False
    return token_is_not_a_parenthesis(iQuote, lChars)
223232
224233
def there_is_a_single_token_between_quotes(iIndex, lQuotes):
    """Return True when the quote positions at iIndex and iIndex + 1 are two apart."""
    iOpening = lQuotes[iIndex]
    iClosing = lQuotes[iIndex + 1]
    return iClosing == iOpening + 2
229236
230237
def token_between_quotes_is_a_single_character(iQuote, lChars):
    """Return True when the token right after position iQuote has length one."""
    sToken = lChars[iQuote + 1]
    return len(sToken) == 1
235240
236241
def token_is_not_a_parenthesis(iQuote, lChars):
    """Return True unless the token right after position iQuote is "("."""
    sToken = lChars[iQuote + 1]
    return not sToken == "("
241244
242245
243246def filter_character_literal_candidates (lLiterals ):
@@ -272,15 +275,11 @@ def append_to_list(bSymbol, lChars, sChar):
272275
273276
def backslash_character_found(sChar):
    """Return True when *sChar* is exactly one backslash character.

    The comparison literal is a lone backslash; a literal with a trailing
    space could never equal a single character.
    """
    return sChar == "\\"
278279
279280
def stop_character_found(sChar, bLiteral):
    """Report a stop token, but only when bLiteral is set.

    A token counts as a stop token when it is a member of lStopChars or
    contains a space.  For a stop token the value of bLiteral is returned;
    for any other token the result is False.
    """
    bStopToken = sChar in lStopChars or " " in sChar
    if not bStopToken:
        return False
    return bLiteral
284283
285284
286285def add_trailing_string (lReturn , sString ):
@@ -289,21 +288,8 @@ def add_trailing_string(lReturn, sString):
289288 return lReturn
290289
291290
292- def convert_string_to_chars (sString ):
293- lReturn = []
294- for sChar in sString :
295- lReturn .append (sChar )
296- return lReturn
297-
298-
def character_is_part_of_word(sChar):
    """Return True when *sChar* is a single character that can belong to a word.

    A token qualifies when it is exactly one character long, is not
    whitespace, and is not listed in lSingleCharacterSymbols.
    """
    if len(sChar) != 1:
        return False
    if sChar.isspace():
        return False
    return sChar not in lSingleCharacterSymbols
307293
308294
309295def find_indexes_of_double_quote_pairs (lTokens ):
0 commit comments