-def get_tokens(text):
+def get_tokens(filepath):
     tokens = []

+    with open(filepath, "r") as f:
+        text = f.read()
+
     text_len = len(text)

     i = 0
     while i < text_len:
         char = text[i]

         if char == "/":
-            i = tokenize_comment(i, text_len, text, tokens)
+            i = tokenize_comment(i, text_len, text, tokens, filepath)
         elif char == "\t":
-            i = tokenize_tabs(i, text_len, text, tokens)
+            i = tokenize_tabs(i, text_len, text, tokens, filepath)
         elif char == " ":
-            i = tokenize_spaces(i, text_len, text, tokens)
+            i = tokenize_spaces(i, text_len, text, tokens, filepath)
         elif char == "=":
-            i = tokenize_equals(i, text_len, text, tokens)
+            i = tokenize_equals(i, text_len, text, tokens, filepath)
         elif char == "\n":
-            i = tokenize_newline(i, text_len, text, tokens)
+            i = tokenize_newline(i, text_len, text, tokens, filepath)
         else:
-            i = tokenize_word(i, text_len, text, tokens)
+            i = tokenize_word(i, text_len, text, tokens, filepath)

     return tokens


-def get_token(type_, content):
-    return { "type": type_, "content": content }
+def get_token(type_, content, i, filepath):
+    return { "type": type_, "content": content, "index": i, "filepath": filepath }


-def tokenize_comment(i, text_len, text, tokens):
+def tokenize_comment(i, text_len, text, tokens, filepath):
     if i + 1 < text_len and text[i + 1] == "/":
-        return tokenize_single_line_comment(i, text_len, text, tokens)
+        return tokenize_single_line_comment(i, text_len, text, tokens, filepath)
     else:
-        return tokenize_multi_line_comment(i, text_len, text, tokens)
+        return tokenize_multi_line_comment(i, text_len, text, tokens, filepath)


-def tokenize_single_line_comment(i, text_len, text, tokens):
+def tokenize_single_line_comment(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] != "\n":
         token += text[i]
         i += 1

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_multi_line_comment(i, text_len, text, tokens):
+def tokenize_multi_line_comment(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and not (text[i] == "*" and i + 1 < text_len and text[i + 1] == "/"):
@@ -56,66 +59,66 @@ def tokenize_multi_line_comment(i, text_len, text, tokens):
     token += "*/"
     i += 2

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_tabs(i, text_len, text, tokens):
+def tokenize_tabs(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "\t":
         token += text[i]
         i += 1

-    tokens.append(get_token("TABS", token))
+    tokens.append(get_token("TABS", token, i, filepath))

     return i


-def tokenize_spaces(i, text_len, text, tokens):
+def tokenize_spaces(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == " ":
         token += text[i]
         i += 1

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_equals(i, text_len, text, tokens):
+def tokenize_equals(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "=":
         token += text[i]
         i += 1

-    tokens.append(get_token("EQUALS", token))
+    tokens.append(get_token("EQUALS", token, i, filepath))

     return i


-def tokenize_newline(i, text_len, text, tokens):
+def tokenize_newline(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "\n":
         token += text[i]
         i += 1

-    tokens.append(get_token("NEWLINES", token))  # TODO: Maybe use "NEWLINE" instead of the plural version?
+    tokens.append(get_token("NEWLINES", token, i, filepath))  # TODO: Maybe use "NEWLINE" instead of the plural version?

     return i


-def tokenize_word(i, text_len, text, tokens):
+def tokenize_word(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] not in ("\t =\n") and not (text[i] == "/" and i + 1 < text_len and text[i + 1] == "/"):
         token += text[i]
         i += 1

-    tokens.append(get_token("WORD", token))
+    tokens.append(get_token("WORD", token, i, filepath))

     return i
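
For reference, a minimal usage sketch of the updated entry point (not part of this commit). The module name `tokenizer`, the sample path, and the sample source text are assumptions made for illustration only:

# Usage sketch (assumption: the functions above live in a module named tokenizer).
import tokenizer

# get_tokens() now takes a file path instead of the text itself, so write a
# small hypothetical sample file first.
sample_path = "sample.src"  # hypothetical file name
with open(sample_path, "w") as f:
    f.write("x\t= 1 // trailing comment\n")

# Each token dict now carries "index" and "filepath" alongside "type" and "content".
for token in tokenizer.get_tokens(sample_path):
    print(token["type"], repr(token["content"]), token["index"], token["filepath"])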