Commit c6486ea

Use regex for tokenizing words
1 parent d9dbb60 commit c6486ea

File tree

1 file changed: 13 additions, 3 deletions


Python/ini_converting/ini_tokenizer.py

Lines changed: 13 additions & 3 deletions
@@ -1,3 +1,6 @@
+import re
+
+
 def get_tokens(filepath):
     tokens = []
 
@@ -115,9 +118,16 @@ def tokenize_newline(i, text_len, text, tokens, filepath):
 def tokenize_word(i, text_len, text, tokens, filepath):
     token = ""
 
-    while i < text_len and text[i] not in ("\t =\n") and not (text[i] == "/" and i + 1 < text_len and text[i + 1] == "/"):
-        token += text[i]
-        i += 1
+    subtext = text[i:]
+    token = re.match("(\S+([\t\f\v ]*\S+)*)", subtext).group(0)
+
+    token = token.split("//", maxsplit=1)[0]
+    token = token.split("/*", maxsplit=1)[0]
+    token = token.split("=", maxsplit=1)[0]
+
+    token = token.rstrip()
+
+    i += len(token)
 
     tokens.append(get_token("WORD", token, i, filepath))
 
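The helpers around tokenize_word (notably get_token) are not part of this diff, so the following is a minimal runnable sketch of the new regex path under assumed signatures: the get_token stand-in is hypothetical, returning the index from tokenize_word is an assumption, and "example.ini" is an illustrative filepath. One note: the committed code passes the pattern as a plain string, where \S is an invalid string escape that newer Pythons flag with a warning; the sketch uses a raw string for the same pattern.

import re

def get_token(kind, value, i, filepath):
    # Hypothetical stand-in for the repo's get_token() helper.
    return {"type": kind, "value": value, "index": i, "filepath": filepath}

def tokenize_word(i, text_len, text, tokens, filepath):
    # text_len is unused by the regex version; kept for signature parity.
    subtext = text[i:]

    # Match a run of non-whitespace chunks separated by horizontal
    # whitespace only; \n and \r are excluded by [\t\f\v ], so the
    # match never crosses a line boundary.
    token = re.match(r"\S+([\t\f\v ]*\S+)*", subtext).group(0)

    # Cut the word off at a // or /* comment opener, or at an equals
    # sign, then drop whatever whitespace is left before the cut.
    token = token.split("//", maxsplit=1)[0]
    token = token.split("/*", maxsplit=1)[0]
    token = token.split("=", maxsplit=1)[0]
    token = token.rstrip()

    # Advance only past the characters actually consumed as the word,
    # so the next tokenizer step resumes at the comment or '='.
    i += len(token)

    tokens.append(get_token("WORD", token, i, filepath))
    return i  # assumption: the caller continues from the new index

tokens = []
text = "foo bar = baz"
tokenize_word(0, len(text), text, tokens, "example.ini")
print(tokens)  # [{'type': 'WORD', 'value': 'foo bar', 'index': 7, 'filepath': 'example.ini'}]

With "foo bar = baz" the regex first grabs the whole line, the "=" split trims it to "foo bar ", and rstrip() leaves "foo bar", so the index advances by 7 and lands on the space just before the "=".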

0 commit comments