Skip to content

Commit 0a70381

Browse files
committed
Issue#1537: Replaced combine_three_character_symbols and combine_two_character_symbols with symbol prefix tree check.
1 parent 4019c18 commit 0a70381

File tree

1 file changed

+44
-38
lines changed

1 file changed

+44
-38
lines changed

vsg/tokens.py

Lines changed: 44 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,26 @@
33
lSingleCharacterSymbols = {",", ":", "(", ")", "'", '"', "+", "&", "-", "*", "/", "<", ">", ";", "=", "[", "]", "?"}
44
lTwoCharacterSymbols = {"=>", "**", ":=", "/=", ">=", "<=", "<>", "??", "?=", "?<", "?>", "<<", ">>", "--", "/*", "*/"}
55
lThreeCharacterSymbols = {"?/=", "?<=", "?>="}
6-
lFourCharacterSymbols = {"\\?=\\"}
6+
lMultiCharacterSymbols = lTwoCharacterSymbols | lThreeCharacterSymbols
77

88
lStopChars = {" ", "(", ";"}
99

1010

11+
def build_symbol_prefix_tree(lSymbols):
    """
    Builds a prefix tree out of nested dictionaries from the given symbols.

    Every character of a symbol is a key that leads to the dictionary for the
    next character.  The dictionary reached after the last character of a
    symbol stores the complete symbol under the "$" key.

    Parameters:

      lSymbols : (iterable of strings) the symbols to place in the tree

    Returns: (dictionary) the root of the prefix tree

    Raises: ValueError if a symbol contains "$", because that key is reserved
    as the end of branch marker and would otherwise corrupt the tree.
    """
    # Use $ as the marker for the end of a branch.
    sEndMarker = "$"
    dPrefixTree = {}
    for sSymbol in lSymbols:
        # Check before inserting so a rejected symbol leaves no partial branch behind.
        if sEndMarker in sSymbol:
            raise ValueError(f"Symbol {sSymbol!r} contains the reserved end of branch marker {sEndMarker!r}.")
        dNode = dPrefixTree
        for sChar in sSymbol:
            # Return the branch of the prefix tree for this character, or create an empty branch if there isn't one.
            dNode = dNode.setdefault(sChar, {})
        dNode[sEndMarker] = sSymbol
    return dPrefixTree
21+
22+
23+
dSymbolTree = build_symbol_prefix_tree(lMultiCharacterSymbols)
24+
25+
1126
def create(sString):
1227
"""
1328
This function takes a string and returns a list of tokens.
@@ -17,8 +32,7 @@ def create(sString):
1732
oLine.combine_whitespace()
1833
oLine.combine_string_literals()
1934
oLine.combine_backslash_characters_into_symbols()
20-
oLine.combine_three_character_symbols()
21-
oLine.combine_two_character_symbols()
35+
oLine.combine_symbols_with_prefix_tree()
2236
oLine.combine_characters_into_words()
2337
oLine.combine_character_literals()
2438
oLine.split_natural_numbers()
@@ -46,6 +60,30 @@ def combine_whitespace(self):
4660

4761
self.lChars = lReturn
4862

63+
def combine_symbols_with_prefix_tree(self):
    """
    Combines adjacent entries of self.lChars into multi-character symbols.

    Walks self.lChars from left to right.  At each position the module level
    dSymbolTree prefix tree is followed for as long as the entries allow, and
    the longest symbol found along that walk replaces its entries with a
    single entry.  When no symbol starts at the position, the entry is copied
    unchanged.  The combined list replaces self.lChars.
    """
    # Key under which the prefix tree stores the complete symbol of a branch.
    sEndMarker = "$"
    lReturn = []
    iStart = 0
    iNumChars = len(self.lChars)
    while iStart < iNumChars:
        dNode = dSymbolTree
        iEnd = iStart
        oLastMatch = None
        iPrevEnd = iStart
        # Try to match as long a symbol as possible.
        while iEnd < iNumChars:
            sChar = self.lChars[iEnd]
            # The end marker key holds a symbol string, not a branch.  A literal
            # "$" in the input must therefore never be followed into the tree.
            if sChar == sEndMarker or sChar not in dNode:
                break
            dNode = dNode[sChar]
            iEnd += 1
            if sEndMarker in dNode:
                # Remember the longest complete symbol seen so far and where it ends.
                oLastMatch = dNode[sEndMarker]
                iPrevEnd = iEnd
        if oLastMatch is not None:
            lReturn.append(oLastMatch)
            iStart = iPrevEnd
        else:
            lReturn.append(self.lChars[iStart])
            iStart += 1
    self.lChars = lReturn
86+
4987
def combine_backslash_characters_into_symbols(self):
5088
lReturn = []
5189
sSymbol = ""
@@ -61,34 +99,6 @@ def combine_backslash_characters_into_symbols(self):
6199
lReturn = add_trailing_string(lReturn, sSymbol)
62100
self.lChars = lReturn
63101

64-
def combine_three_character_symbols(self):
    """
    Merges every run of three consecutive entries of self.lChars that spells
    a member of lThreeCharacterSymbols into a single entry.

    All other entries are copied over unchanged.  The result replaces
    self.lChars.
    """
    lTokens = self.lChars
    lCombined = []
    iIndex = 0
    while iIndex < len(lTokens):
        sCandidate = "".join(lTokens[iIndex : iIndex + 3])
        if sCandidate not in lThreeCharacterSymbols:
            # No symbol starts here, keep the entry and move one position on.
            lCombined.append(lTokens[iIndex])
            iIndex += 1
            continue
        lCombined.append(sCandidate)
        iIndex += 3

    self.lChars = lCombined
77-
78-
def combine_two_character_symbols(self):
    """
    Merges every pair of consecutive entries of self.lChars that spells a
    member of lTwoCharacterSymbols into a single entry.

    All other entries are copied over unchanged.  The result replaces
    self.lChars.
    """
    lTokens = self.lChars
    lCombined = []
    iIndex = 0
    while iIndex < len(lTokens):
        sCandidate = "".join(lTokens[iIndex : iIndex + 2])
        if sCandidate not in lTwoCharacterSymbols:
            # No symbol starts here, keep the entry and move one position on.
            lCombined.append(lTokens[iIndex])
            iIndex += 1
            continue
        lCombined.append(sCandidate)
        iIndex += 2

    self.lChars = lCombined
91-
92102
def combine_characters_into_words(self):
93103
lReturn = []
94104
sWord = []
@@ -98,12 +108,12 @@ def combine_characters_into_words(self):
98108
sWord.append(sChar)
99109
else:
100110
if sWord:
101-
lReturn.append(''.join(sWord))
111+
lReturn.append("".join(sWord))
102112
sWord.clear()
103113
lReturn.append(sChar)
104114

105115
if sWord:
106-
lReturn.append(''.join(sWord))
116+
lReturn.append("".join(sWord))
107117

108118
self.lChars = lReturn
109119

@@ -279,11 +289,7 @@ def add_trailing_string(lReturn, sString):
279289

280290

281291
def character_is_part_of_word(sChar):
    """
    Reports whether sChar can belong to a word.

    Returns True only when sChar is exactly one character long, is not
    whitespace and is not a member of lSingleCharacterSymbols.
    """
    if len(sChar) != 1:
        return False
    if sChar.isspace():
        return False
    return sChar not in lSingleCharacterSymbols
287293

288294

289295
def find_indexes_of_double_quote_pairs(lTokens):

0 commit comments

Comments
 (0)