Commit 30c5d79

Handle unindented comments in get_parsed_tokens()
Add get_token_pos() for is_deeper(), so its error can report where a line is more than 1 level deeper.
Use default values for ini_parser.get_parsed_tokens().
1 parent 0d58b55 commit 30c5d79
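In practice, the stricter is_deeper() check means a line that jumps more than one indentation level deeper now fails loudly instead of being nested silently. A minimal sketch of what that looks like, assuming the modules are importable as plain ini_tokenizer / ini_parser (the file name and contents below are invented):

import ini_tokenizer, ini_parser  # import style assumed; adjust to the repo's package layout

with open("bad.ini", "w") as f:                 # hypothetical example file
	f.write("AddEffect=MOPixel\n\t\tMass=1\n")  # second line jumps from depth 0 straight to depth 2

tokens = ini_tokenizer.get_tokens("bad.ini")
ini_parser.get_parsed_tokens(tokens)
# Expected to raise something like:
#   ValueError: Too many tabs found at line 2, column 2 in bad.ini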

5 files changed: +81 -52 lines changed


Python/ini_converting/ini_cst_builder.py

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ def get_ini_cst(input_folder_path, subfolder_path):
 		if not utils.is_mod_folder_or_subfolder(relative_subfolder): # TODO: Remove this once CCCP has a Mods folder that can be iterated over.
 			continue
 		elif p.is_file() and p.suffix == ".ini" and p.stem != "desktop": # Skip the desktop.ini Windows metadata file.
-			tokens = ini_tokenizer.get_tokens(p.read_text())
-			parsed_portion[name] = ini_parser.get_parsed_tokens(tokens, [], [0])
+			tokens = ini_tokenizer.get_tokens(str(p))
+			parsed_portion[name] = ini_parser.get_parsed_tokens(tokens)
 		elif p.is_dir():
 			parsed_portion[name] = get_ini_cst(input_folder_path, str(p))
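The call pattern above changes in two ways: the tokenizer now receives a file path and reads the file itself, and the parser supplies its own accumulator defaults. A rough usage sketch, assuming the modules are importable as ini_tokenizer / ini_parser and given some existing .ini path:

import ini_tokenizer, ini_parser  # import style assumed

path = "Mods/Example.rte/Example.ini"       # hypothetical path
tokens = ini_tokenizer.get_tokens(path)     # was: get_tokens(p.read_text())
cst = ini_parser.get_parsed_tokens(tokens)  # was: get_parsed_tokens(tokens, [], [0])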

Python/ini_converting/ini_parser.py

Lines changed: 46 additions & 20 deletions
@@ -1,34 +1,41 @@
-def get_parsed_tokens(tokens, parsed, token_idx, depth=0):
+def get_parsed_tokens(tokens, parsed=None, token_idx=None, depth=0):
 	"""
-	start -> tabs -> property -> equals -> value -> newline
-	  ^   v             ^                              v
-	  |   +-------------+                              |
-	  +------------------------------------------------+
+	newline -> start -> property -> equals -> value
+	   ^                                        v
+	   +----------------------------------------+
 	"""
-	state = "start"
+
+	if parsed == None:
+		parsed = []
+	if token_idx == None:
+		token_idx = [0]
+
+	state = "newline"

 	while token_idx[0] < len(tokens):
 		token = tokens[token_idx[0]]

-		if state == "start" and token["type"] == "TABS" and is_deeper(depth, token):
+		if state == "newline" and token["type"] == "EXTRA":
+			parsed[-1].append( { "type": "extra", "content": token["content"] } )
+			token_idx[0] += 1
+		elif state == "newline" and token["type"] == "NEWLINES":
+			parsed[-1].append( { "type": "extra", "content": token["content"] } )
+			token_idx[0] += 1
+
+		elif state == "newline" and token["type"] == "TABS" and is_deeper(depth, token):
 			children = { "type": "children", "content": [] }
 			parsed[-1].append(children)
 			get_parsed_tokens(tokens, children["content"], token_idx, depth + 1)
-		elif state == "start" and is_less_deep(depth, token):
+		elif state == "newline" and is_less_deep(depth, token):
 			return
-
-		elif state == "start":
+		elif state == "newline":
 			parsed.append([])
-			state = "not-start"
-		elif state == "not-start" and token["type"] == "TABS":
-			parsed[-1].append( { "type": "extra", "content": token["content"] } )
-			state = "tabs"
-			token_idx[0] += 1
-		elif (state == "not-start" or state == "tabs") and token["type"] == "WORD":
+			state = "start"
+
+		elif state == "start" and token["type"] == "WORD":
 			parsed[-1].append( { "type": "property", "content": token["content"] } )
 			state = "property"
 			token_idx[0] += 1
-
 		elif state == "property" and token["type"] == "EQUALS":
 			parsed[-1].append( { "type": "extra", "content": token["content"] } )
 			state = "equals"
@@ -39,7 +46,7 @@ def get_parsed_tokens(tokens, parsed, token_idx, depth=0):
 			token_idx[0] += 1
 		elif state == "value" and token["type"] == "NEWLINES":
 			parsed[-1].append( { "type": "extra", "content": token["content"] } )
-			state = "start"
+			state = "newline"
 			token_idx[0] += 1

 		else:
@@ -54,9 +61,28 @@ def is_less_deep(depth, token):


 def is_deeper(depth, token):
-	# TODO: This should throw an error if it's deeper by more than 1.
-	return get_depth(token) > depth
+	new_depth = get_depth(token)
+	if new_depth > depth + 1:
+		line, column = get_token_pos(token)
+		raise ValueError(f"Too many tabs found at line {line}, column {column} in {token['filepath']}")
+	return new_depth > depth


 def get_depth(token):
 	return len(token["content"]) if token["type"] == "TABS" else 0
+
+
+def get_token_pos(token):
+	with open(token["filepath"], "r") as f:
+		text = f.read()
+	i = 0
+	line = 1
+	column = 1
+	while i < token["index"]:
+		if text[i] == '\n':
+			line += 1
+			column = 0
+		else:
+			column += 1
+		i += 1
+	return line, column
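One reason for the parsed=None / token_idx=None defaults with the == None guards above, rather than parsed=[], is Python's shared-mutable-default behavior: a default list is created once when the function is defined, not once per call. A standalone illustration of the pitfall (function names here are made up):

def bad_collect(value, acc=[]):     # the same list object is reused across calls
	acc.append(value)
	return acc

def good_collect(value, acc=None):  # fresh list per call, as get_parsed_tokens() now does
	if acc is None:
		acc = []
	acc.append(value)
	return acc

print(bad_collect(1), bad_collect(2))    # [1, 2] [1, 2]  -- surprising shared state
print(good_collect(1), good_collect(2))  # [1] [2]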

Python/ini_converting/ini_parser_tests.py

Lines changed: 3 additions & 3 deletions
@@ -69,6 +69,6 @@ def parser_tests():

 def test(filename, expected):
 	text = tests.read_test(filename)
-	tokens = ini_tokenizer.get_tokens(text)
-	ini_cst = ini_parser.get_parsed_tokens(tokens, [], [0])
-	tests.test(text, ini_cst, expected)
+	tokens = ini_tokenizer.get_tokens(filename)
+	ini_cst = ini_parser.get_parsed_tokens(tokens)
+	tests.test(text, ini_cst, expected)
Python/ini_converting/ini_tokenizer.py

Lines changed: 29 additions & 26 deletions
@@ -1,52 +1,55 @@
-def get_tokens(text):
+def get_tokens(filepath):
 	tokens = []

+	with open(filepath, "r") as f:
+		text = f.read()
+
 	text_len = len(text)

 	i = 0
 	while i < text_len:
 		char = text[i]

 		if char == "/":
-			i = tokenize_comment(i, text_len, text, tokens)
+			i = tokenize_comment(i, text_len, text, tokens, filepath)
 		elif char == "\t":
-			i = tokenize_tabs(i, text_len, text, tokens)
+			i = tokenize_tabs(i, text_len, text, tokens, filepath)
 		elif char == " ":
-			i = tokenize_spaces(i, text_len, text, tokens)
+			i = tokenize_spaces(i, text_len, text, tokens, filepath)
 		elif char == "=":
-			i = tokenize_equals(i, text_len, text, tokens)
+			i = tokenize_equals(i, text_len, text, tokens, filepath)
 		elif char == "\n":
-			i = tokenize_newline(i, text_len, text, tokens)
+			i = tokenize_newline(i, text_len, text, tokens, filepath)
 		else:
-			i = tokenize_word(i, text_len, text, tokens)
+			i = tokenize_word(i, text_len, text, tokens, filepath)

 	return tokens


-def get_token(type_, content):
-	return { "type": type_, "content": content }
+def get_token(type_, content, i, filepath):
+	return { "type": type_, "content": content, "index": i, "filepath": filepath }


-def tokenize_comment(i, text_len, text, tokens):
+def tokenize_comment(i, text_len, text, tokens, filepath):
 	if i + 1 < text_len and text[i + 1] == "/":
-		return tokenize_single_line_comment(i, text_len, text, tokens)
+		return tokenize_single_line_comment(i, text_len, text, tokens, filepath)
 	else:
-		return tokenize_multi_line_comment(i, text_len, text, tokens)
+		return tokenize_multi_line_comment(i, text_len, text, tokens, filepath)


-def tokenize_single_line_comment(i, text_len, text, tokens):
+def tokenize_single_line_comment(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] != "\n":
 		token += text[i]
 		i += 1

-	tokens.append(get_token("EXTRA", token))
+	tokens.append(get_token("EXTRA", token, i, filepath))

 	return i


-def tokenize_multi_line_comment(i, text_len, text, tokens):
+def tokenize_multi_line_comment(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and not (text[i] == "*" and i + 1 < text_len and text[i + 1] == "/"):
@@ -56,66 +59,66 @@ def tokenize_multi_line_comment(i, text_len, text, tokens):
 	token += "*/"
 	i += 2

-	tokens.append(get_token("EXTRA", token))
+	tokens.append(get_token("EXTRA", token, i, filepath))

 	return i


-def tokenize_tabs(i, text_len, text, tokens):
+def tokenize_tabs(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] == "\t":
 		token += text[i]
 		i += 1

-	tokens.append(get_token("TABS", token))
+	tokens.append(get_token("TABS", token, i, filepath))

 	return i


-def tokenize_spaces(i, text_len, text, tokens):
+def tokenize_spaces(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] == " ":
 		token += text[i]
 		i += 1

-	tokens.append(get_token("EXTRA", token))
+	tokens.append(get_token("EXTRA", token, i, filepath))

 	return i


-def tokenize_equals(i, text_len, text, tokens):
+def tokenize_equals(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] == "=":
 		token += text[i]
 		i += 1

-	tokens.append(get_token("EQUALS", token))
+	tokens.append(get_token("EQUALS", token, i, filepath))

 	return i


-def tokenize_newline(i, text_len, text, tokens):
+def tokenize_newline(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] == "\n":
 		token += text[i]
 		i += 1

-	tokens.append(get_token("NEWLINES", token)) # TODO: Maybe use "NEWLINE" instead of the plural version?
+	tokens.append(get_token("NEWLINES", token, i, filepath)) # TODO: Maybe use "NEWLINE" instead of the plural version?

 	return i


-def tokenize_word(i, text_len, text, tokens):
+def tokenize_word(i, text_len, text, tokens, filepath):
 	token = ""

 	while i < text_len and text[i] not in ("\t =\n") and not (text[i] == "/" and i + 1 < text_len and text[i + 1] == "/"):
 		token += text[i]
 		i += 1

-	tokens.append(get_token("WORD", token))
+	tokens.append(get_token("WORD", token, i, filepath))

 	return i
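With the two extra fields, every token now records the file it came from and the index at which the scan stopped (one past the token's last character), which is what get_token_pos() uses to recover a line and column. A quick way to inspect the new token shape, assuming ini_tokenizer is importable and using a throwaway temp file:

import tempfile, os
import ini_tokenizer  # import style assumed

with tempfile.NamedTemporaryFile("w", suffix=".ini", delete=False) as f:
	f.write("Mass = 20\n")
	path = f.name

for token in ini_tokenizer.get_tokens(path):
	print(token)
# First token is roughly:
#   {'type': 'WORD', 'content': 'Mass', 'index': 4, 'filepath': '/tmp/...'}

os.remove(path)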

Python/ini_converting/ini_tokenizer_tests.py

Lines changed: 1 addition & 1 deletion
@@ -30,4 +30,4 @@ def tokenizer_tests():

 def test(filename, expected):
 	text = tests.read_test(filename)
-	tests.test(text, ini_tokenizer.get_tokens(text), expected)
+	tests.test(text, ini_tokenizer.get_tokens(filename), expected)
