11# -*- coding: utf-8 -*-
22
# Symbols recognised by the tokenizer, grouped by length.  Sets are used so
# that membership tests do not have to scan a list.
lSingleCharacterSymbols = {",", ":", "(", ")", "'", '"', "+", "&", "-", "*", "/", "<", ">", ";", "=", "[", "]", "?"}
lTwoCharacterSymbols = {"=>", "**", ":=", "/=", ">=", "<=", "<>", "??", "?=", "?<", "?>", "<<", ">>", "--", "/*", "*/"}
lThreeCharacterSymbols = {"?/=", "?<=", "?>="}
# Every symbol that is longer than a single character.
lMultiCharacterSymbols = lTwoCharacterSymbols | lThreeCharacterSymbols

# Characters tested by stop_character_found().
lStopChars = {" ", "(", ";"}
9+
10+
def build_symbol_prefix_tree(lSymbols):
    """Build a prefix tree (trie) from an iterable of symbol strings.

    Each node is a dict that maps the next character to a child node.  A
    node at which a complete symbol ends additionally stores that symbol
    under the key "$".

    Because "$" is used as the end-of-symbol key, symbols passed in must not
    themselves contain the "$" character.

    Parameters:
        lSymbols: iterable of non-empty strings.

    Returns:
        The root node of the tree; an empty dict when lSymbols is empty.
    """
    dPrefixTree = {}
    for sSymbol in lSymbols:
        dNode = dPrefixTree
        for oChar in sSymbol:
            # Descend into the branch for this character, creating an empty
            # branch first if there isn't one yet.
            dNode = dNode.setdefault(oChar, {})
        # Use $ as the marker for the end of a branch.
        dNode["$"] = sSymbol
    return dPrefixTree
21+
22+
# Prefix tree over every multi-character symbol, built once at import time.
dSymbolTree = build_symbol_prefix_tree(lMultiCharacterSymbols)
924
1025
1126def create (sString ):
@@ -17,8 +32,7 @@ def create(sString):
1732 oLine .combine_whitespace ()
1833 oLine .combine_string_literals ()
1934 oLine .combine_backslash_characters_into_symbols ()
20- oLine .combine_three_character_symbols ()
21- oLine .combine_two_character_symbols ()
35+ oLine .combine_symbols_with_prefix_tree ()
2236 oLine .combine_characters_into_words ()
2337 oLine .combine_character_literals ()
2438 oLine .split_natural_numbers ()
@@ -28,7 +42,7 @@ def create(sString):
2842
2943class New :
def __init__(self, sLine):
    """Store the characters of sLine, one list element per character."""
    self.lChars = list(sLine)
3246
3347 def combine_whitespace (self ):
3448 lReturn = []
@@ -46,6 +60,30 @@ def combine_whitespace(self):
4660
4761 self .lChars = lReturn
4862
def combine_symbols_with_prefix_tree(self):
    """Merge runs of tokens in self.lChars into multi-character symbols.

    Walks self.lChars from left to right.  At each position the module-level
    prefix tree dSymbolTree is followed for as long as the upcoming tokens
    match, and the longest complete symbol seen on that walk is emitted as a
    single token.  When no symbol starts at the current position the token is
    copied through unchanged.  The result replaces self.lChars.
    """
    lReturn = []
    iStart = 0
    iNumChars = len(self.lChars)
    while iStart < iNumChars:
        dNode = dSymbolTree
        iEnd = iStart
        oLastMatch = None
        iPrevEnd = iStart
        # Try to match as long a symbol as possible.
        while iEnd < iNumChars:
            sChar = self.lChars[iEnd]
            # "$" is the key the tree uses to store a completed symbol, so a
            # literal "$" token must never be followed as a branch: doing so
            # would replace dNode with the stored symbol string and break
            # the lookups that follow.
            if sChar == "$" or sChar not in dNode:
                break
            dNode = dNode[sChar]
            iEnd += 1
            if "$" in dNode:
                # Remember the longest complete symbol found so far.
                oLastMatch = dNode["$"]
                iPrevEnd = iEnd
        if oLastMatch is not None:
            lReturn.append(oLastMatch)
            iStart = iPrevEnd
        else:
            lReturn.append(self.lChars[iStart])
            iStart += 1
    self.lChars = lReturn
86+
4987 def combine_backslash_characters_into_symbols (self ):
5088 lReturn = []
5189 sSymbol = ""
@@ -61,48 +99,21 @@ def combine_backslash_characters_into_symbols(self):
6199 lReturn = add_trailing_string (lReturn , sSymbol )
62100 self .lChars = lReturn
63101
64- def combine_three_character_symbols (self ):
65- lReturn = []
66- i = 0
67- while i < len (self .lChars ):
68- sChars = "" .join (self .lChars [i : i + 3 ])
69- if sChars in lThreeCharacterSymbols :
70- lReturn .append (sChars )
71- i += 3
72- else :
73- lReturn .append (self .lChars [i ])
74- i += 1
75-
76- self .lChars = lReturn
77-
78- def combine_two_character_symbols (self ):
79- lReturn = []
80- i = 0
81- while i < len (self .lChars ):
82- sChars = "" .join (self .lChars [i : i + 2 ])
83- if sChars in lTwoCharacterSymbols :
84- lReturn .append (sChars )
85- i += 2
86- else :
87- lReturn .append (self .lChars [i ])
88- i += 1
89-
90- self .lChars = lReturn
91-
def combine_characters_into_words(self):
    """Join consecutive word characters in self.lChars into single tokens.

    A token belongs to a word when character_is_part_of_word() accepts it.
    Any other token ends the current word and is copied through unchanged.
    The result replaces self.lChars.
    """
    lReturn = []
    # Characters of the word currently being collected; joined once when the
    # word ends instead of growing a string one character at a time.
    lWord = []

    for sChar in self.lChars:
        if character_is_part_of_word(sChar):
            lWord.append(sChar)
        else:
            if lWord:
                lReturn.append("".join(lWord))
                lWord.clear()
            lReturn.append(sChar)

    # Flush a word that runs to the end of the line.
    if lWord:
        lReturn.append("".join(lWord))

    self.lChars = lReturn
108119
@@ -213,21 +224,15 @@ def find_character_literal_candidates(lQuotes, lChars):
213224
def is_character_literal_candidate(iIndex, lQuotes, lChars):
    """Return True when the quote at lQuotes[iIndex] may open a character literal.

    That is the case when exactly one token lies between this quote and the
    next entry in lQuotes, and that token is a single character.
    """
    iQuote = lQuotes[iIndex]
    return there_is_a_single_token_between_quotes(iIndex, lQuotes) and token_between_quotes_is_a_single_character(iQuote, lChars)
219228
220229
def there_is_a_single_token_between_quotes(iIndex, lQuotes):
    """Return True when lQuotes[iIndex + 1] is exactly two positions after lQuotes[iIndex].

    A distance of two leaves room for exactly one token between the two
    positions.  lQuotes must have an entry at iIndex + 1.
    """
    return lQuotes[iIndex] + 2 == lQuotes[iIndex + 1]
225232
226233
def token_between_quotes_is_a_single_character(iQuote, lChars):
    """Return True when the token right after index iQuote in lChars has length one."""
    return len(lChars[iQuote + 1]) == 1
231236
232237
233238def filter_character_literal_candidates (lCandidates ):
@@ -287,15 +292,11 @@ def append_to_list(bSymbol, lChars, sChar):
287292
288293
def backslash_character_found(sChar):
    """Return True when sChar is exactly one backslash character."""
    return sChar == "\\"
293296
294297
def stop_character_found(sChar, bLiteral):
    """Return True when bLiteral is set and sChar is a stop token.

    A stop token is a member of lStopChars or any token that contains a
    space.  The result is always a real bool, whatever object is passed as
    bLiteral.
    """
    return bool(bLiteral) and (sChar in lStopChars or " " in sChar)
299300
300301
301302def add_trailing_string (lReturn , sString ):
@@ -304,21 +305,8 @@ def add_trailing_string(lReturn, sString):
304305 return lReturn
305306
306307
307- def convert_string_to_chars (sString ):
308- lReturn = []
309- for sChar in sString :
310- lReturn .append (sChar )
311- return lReturn
312-
313-
def character_is_part_of_word(sChar):
    """Return True when sChar can be part of a word.

    Only a single, non-whitespace character that is not listed in
    lSingleCharacterSymbols qualifies; tokens of any other length are
    rejected.
    """
    return len(sChar) == 1 and not sChar.isspace() and sChar not in lSingleCharacterSymbols
322310
323311
324312def find_indexes_of_double_quote_pairs (lTokens ):
0 commit comments