From 92ecf9c9fcaec90ec66f3c5c91448ef4a9f650a5 Mon Sep 17 00:00:00 2001
From: ksfi
Date: Thu, 28 Dec 2023 18:26:53 +0100
Subject: [PATCH 1/6] Handle dedent > 0 and character escape sequences in the
 lexer

---
 Tools/cases_generator/lexer.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Tools/cases_generator/lexer.py b/Tools/cases_generator/lexer.py
index c3c2954a42083f..90121f41c69250 100644
--- a/Tools/cases_generator/lexer.py
+++ b/Tools/cases_generator/lexer.py
@@ -109,7 +109,7 @@ def choice(*opts: str) -> str:
 string_char = r"""([^"\\\n]|""" + escape_sequence + ")"
 str_re = '"' + string_char + '*"'
 STRING = "STRING"
-char = r"\'.\'"  # TODO: escape sequence
+char = r"\'([^'\\]|\\[0-7]{1,3}|\\x[0-9a-fA-F]+|\\.|\\\\)\'"
 CHARACTER = "CHARACTER"
 
 comment_re = r"(//.*)|/\*([^*]|\*[^/])*\*/"
@@ -344,7 +344,16 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str:
         if dedent != 0 and tkn.kind == "COMMENT" and "\n" in text:
             if dedent < 0:
                 text = text.replace("\n", "\n" + " " * -dedent)
-            # TODO: dedent > 0
+            elif dedent > 0:
+                ret = []
+                for line in text.split("\n"):
+                    leading_space = len(line) - len(line.lstrip())
+                    if leading_space > dedent:
+                        line = re.sub(r'(?m)^[ \t]{' + str(dedent) + r'}', '', line)
+                    else:
+                        line = re.sub(r'(?m)^[ \t]{' + str(leading_space) + r'}', '', line)
+                    ret.append(line)
+                text = "\n".join(ret)
         res.append(text)
         line, col = tkn.end
     return "".join(res)

From 0871ee8f1419c503cafb604fc56df0098bd11774 Mon Sep 17 00:00:00 2001
From: ksfi
Date: Thu, 28 Dec 2023 18:49:26 +0100
Subject: [PATCH 2/6] Add type annotations to the new dedent code

---
 Tools/cases_generator/lexer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Tools/cases_generator/lexer.py b/Tools/cases_generator/lexer.py
index 90121f41c69250..0f59208c479095 100644
--- a/Tools/cases_generator/lexer.py
+++ b/Tools/cases_generator/lexer.py
@@ -345,15 +345,15 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str:
             if dedent < 0:
                 text = text.replace("\n", "\n" + " " * -dedent)
             elif dedent > 0:
-                ret = []
+                temp: list[str] = []
                 for line in text.split("\n"):
-                    leading_space = len(line) - len(line.lstrip())
+                    leading_space: int = len(line) - len(line.lstrip())
                     if leading_space > dedent:
                         line = re.sub(r'(?m)^[ \t]{' + str(dedent) + r'}', '', line)
                     else:
                         line = re.sub(r'(?m)^[ \t]{' + str(leading_space) + r'}', '', line)
-                    ret.append(line)
-                text = "\n".join(ret)
+                    temp.append(line)
+                text = "\n".join(temp)
         res.append(text)
         line, col = tkn.end
     return "".join(res)

From a041e526fdcdda757ea87c3b06de0d15b9725640 Mon Sep 17 00:00:00 2001
From: ksfi
Date: Thu, 28 Dec 2023 19:01:37 +0100
Subject: [PATCH 3/6] Drop a redundant type annotation

---
 Tools/cases_generator/lexer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/cases_generator/lexer.py b/Tools/cases_generator/lexer.py
index 0f59208c479095..020dfcc41da6ac 100644
--- a/Tools/cases_generator/lexer.py
+++ b/Tools/cases_generator/lexer.py
@@ -347,7 +347,7 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str:
             elif dedent > 0:
                 temp: list[str] = []
                 for line in text.split("\n"):
-                    leading_space: int = len(line) - len(line.lstrip())
+                    leading_space = len(line) - len(line.lstrip())
                     if leading_space > dedent:
                         line = re.sub(r'(?m)^[ \t]{' + str(dedent) + r'}', '', line)
                     else:

From 0ad22c3bce9cbbfd80e808942c6eb61b03e119d3 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Thu, 28 Dec 2023 18:12:25 +0000
Subject: [PATCH 4/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../Tools-Demos/2023-12-28-18-12-24.gh-issue-113547.TIwp20.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Tools-Demos/2023-12-28-18-12-24.gh-issue-113547.TIwp20.rst

diff --git a/Misc/NEWS.d/next/Tools-Demos/2023-12-28-18-12-24.gh-issue-113547.TIwp20.rst b/Misc/NEWS.d/next/Tools-Demos/2023-12-28-18-12-24.gh-issue-113547.TIwp20.rst
new file mode 100644
index 00000000000000..a24361024c6eb7
--- /dev/null
+++ b/Misc/NEWS.d/next/Tools-Demos/2023-12-28-18-12-24.gh-issue-113547.TIwp20.rst
@@ -0,0 +1 @@
+Implement the remaining TODOs in the cases generator lexer: handle the ``dedent > 0`` case in the ``to_text`` function and escape sequences in character literals.

From 33cb7a02b47a6ef9d94e30e0eadc62eb5c81a37b Mon Sep 17 00:00:00 2001
From: ksfi
Date: Sun, 7 Jan 2024 20:54:06 +0100
Subject: [PATCH 5/6] Simplify the indent expression and add tests

---
 Lib/test/test_tools/test_lexer_to_text.py  | 75 ++++++++++++++++++++++
 Lib/test/test_tools/test_lexer_tokenize.py | 50 +++++++++++++++
 Tools/cases_generator/lexer.py             | 16 ++++++++--------
 3 files changed, 133 insertions(+), 8 deletions(-)
 create mode 100644 Lib/test/test_tools/test_lexer_to_text.py
 create mode 100644 Lib/test/test_tools/test_lexer_tokenize.py

diff --git a/Lib/test/test_tools/test_lexer_to_text.py b/Lib/test/test_tools/test_lexer_to_text.py
new file mode 100644
index 00000000000000..88350139189b76
--- /dev/null
+++ b/Lib/test/test_tools/test_lexer_to_text.py
@@ -0,0 +1,75 @@
+"""Tests for scripts in the Tools directory.
+
+This file contains regression tests for some of the scripts found in the
+Tools directory of a Python checkout or tarball.
+"""
+
+import os
+import unittest
+from test.support.script_helper import assert_python_ok
+
+from test.test_tools import toolsdir, skip_if_missing
+
+skip_if_missing()
+
+class LexerToTextTests(unittest.TestCase):
+    script = os.path.join(toolsdir, 'cases_generator', 'lexer.py')
+
+    def test_multiline_comment_dedent_dedent4(self):
+        input_code = """
+        int main() {
+            /*
+            This is a
+            multi-line comment.
+            Let's see if it de-indents correctly.
+            */
+            return 0;
+        }
+
+        """
+
+        expected_output = """
+    int main() {
+        /*
+        This is a
+        multi-line comment.
+        Let's see if it de-indents correctly.
+        */
+        return 0;
+}
+"""
+
+        dedent_amount = '4'
+        rc, out, err = assert_python_ok(self.script, '-c', input_code, dedent_amount)
+        self.assertEqual(out, bytes(expected_output, 'utf-8')[1:], "Multi-line comment de-indentation failed")
+
+    def test_multiline_comment_dedent_dedent40(self):
+        input_code = """
+        int main() {
+            /*
+            This is a
+            multi-line comment.
+            Let's see if it de-indents correctly.
+            */
+            return 0;
+        }
+
+        """
+
+        expected_output = """
+int main() {
+/*
+This is a
+multi-line comment.
+Let's see if it de-indents correctly.
+*/
+return 0;
+}
+"""
+
+        dedent_amount = '40'
+        rc, out, err = assert_python_ok(self.script, '-c', input_code, dedent_amount)
+        self.assertEqual(out, bytes(expected_output, 'utf-8')[1:], "Multi-line comment de-indentation failed")
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/Lib/test/test_tools/test_lexer_tokenize.py b/Lib/test/test_tools/test_lexer_tokenize.py
new file mode 100644
index 00000000000000..5abf0783a72762
--- /dev/null
+++ b/Lib/test/test_tools/test_lexer_tokenize.py
@@ -0,0 +1,50 @@
+"""Tests for scripts in the Tools directory.
+
+This file contains regression tests for some of the scripts found in the
+Tools directory of a Python checkout or tarball.
+""" + +import os +import unittest +from test.support.script_helper import assert_python_ok +from test.support import findfile + +from test.test_tools import toolsdir, skip_if_missing + +skip_if_missing() + +class TokenizeTests(unittest.TestCase): + script = os.path.join(toolsdir, 'cases_generator', 'lexer.py') + + def test_identifiers(self): + code = "int myVariable = 123;" + expected_out = bytes("INT('int', 1:1:4)\nIDENTIFIER('myVariable', 1:5:15)\nEQUALS('=', 1:16:17)\nNUMBER('123', 1:18:21)\nSEMI(';', 1:21:22)\n", 'utf-8') + rc, out, err = assert_python_ok(self.script, '-c', code) + self.assertEqual(out, expected_out) + + def test_operators(self): + code = "x = y + z;" + expected_out = bytes("IDENTIFIER('x', 1:1:2)\nEQUALS('=', 1:3:4)\nIDENTIFIER('y', 1:5:6)\nPLUS('+', 1:7:8)\nIDENTIFIER('z', 1:9:10)\nSEMI(';', 1:10:11)\n", 'utf-8') + rc, out, err = assert_python_ok(self.script, '-c', code) + self.assertEqual(out, expected_out) + + def test_numbers(self): + code = "int num = 42;" + expected_out = bytes("INT('int', 1:1:4)\nIDENTIFIER('num', 1:5:8)\nEQUALS('=', 1:9:10)\nNUMBER('42', 1:11:13)\nSEMI(';', 1:13:14)\n", 'utf-8') + rc, out, err = assert_python_ok(self.script, '-c', code) + self.assertEqual(out, expected_out) + + def test_strings(self): + code = 'printf("Hello, World!");' + expected_out = bytes("""IDENTIFIER(\'printf\', 1:1:7)\nLPAREN(\'(\', 1:7:8)\nSTRING(\'"Hello, World!"\', 1:8:23)\nRPAREN(\')\', 1:23:24)\nSEMI(\';\', 1:24:25)\n""", 'utf-8') + rc, out, err = assert_python_ok(self.script, '-c', code) + self.assertEqual(out, expected_out) + + def test_characters_with_escape_sequences(self): + code = "char a = '\n'; char b = '\x41'; char c = '\\';" + expected_out = bytes("""CHAR(\'char\', 1:1:5)\nIDENTIFIER(\'a\', 1:6:7)\nEQUALS(\'=\', 1:8:9)\nCHARACTER("\'\\n\'", 1:10:13)\nSEMI(\';\', 1:13:14)\nCHAR(\'char\', 1:15:19)\nIDENTIFIER(\'b\', 1:20:21)\nEQUALS(\'=\', 1:22:23)\nCHARACTER("\'A\'", 1:24:27)\nSEMI(\';\', 1:27:28)\nCHAR(\'char\', 1:29:33)\nIDENTIFIER(\'c\', 1:34:35)\nEQUALS(\'=\', 1:36:37)\nCHARACTER("\'", 1:38:39)\nBACKSLASH(\'\\\\\', 1:39:40)\nCHARACTER("\'", 1:40:41)\nSEMI(\';\', 1:41:42)\n""", 'utf-8') + rc, out, err = assert_python_ok(self.script, '-c', code) + self.assertEqual(out, expected_out) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/Tools/cases_generator/lexer.py b/Tools/cases_generator/lexer.py index 020dfcc41da6ac..c416479fb2c1ae 100644 --- a/Tools/cases_generator/lexer.py +++ b/Tools/cases_generator/lexer.py @@ -347,11 +347,8 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str: elif dedent > 0: temp: list[str] = [] for line in text.split("\n"): - leading_space = len(line) - len(line.lstrip()) - if leading_space > dedent: - line = re.sub(r'(?m)^[ \t]{' + str(dedent) + r'}', '', line) - else: - line = re.sub(r'(?m)^[ \t]{' + str(leading_space) + r'}', '', line) + leading_space = len(line) - len(line.lstrip(' ')) + line = line[min(leading_space, dedent):] temp.append(line) text = "\n".join(temp) res.append(text) @@ -367,6 +364,9 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str: src = sys.argv[2] else: src = open(filename).read() - # print(to_text(tokenize(src))) - for tkn in tokenize(src, filename=filename): - print(tkn) + + dedent = int(sys.argv[3]) + print(to_text(tokenize(src), dedent)) + + # for tkn in tokenize(src, filename=filename): + # print(tkn) From d7e0295c1345ee4ca3f346aed8972ede4dbf0558 Mon Sep 17 00:00:00 2001 From: ffffffff Date: Fri, 3 Oct 2025 18:09:29 +0200 Subject: [PATCH 6/6] 
---
 Lib/test/test_tools/{test_lexer_to_text.py => test_lexer.py} | 0
 Tools/cases_generator/lexer.py                               | 5 +----
 2 files changed, 1 insertion(+), 4 deletions(-)
 rename Lib/test/test_tools/{test_lexer_to_text.py => test_lexer.py} (100%)

diff --git a/Lib/test/test_tools/test_lexer_to_text.py b/Lib/test/test_tools/test_lexer.py
similarity index 100%
rename from Lib/test/test_tools/test_lexer_to_text.py
rename to Lib/test/test_tools/test_lexer.py
diff --git a/Tools/cases_generator/lexer.py b/Tools/cases_generator/lexer.py
index c416479fb2c1ae..cd1076a53fd240 100644
--- a/Tools/cases_generator/lexer.py
+++ b/Tools/cases_generator/lexer.py
@@ -366,7 +366,4 @@ def to_text(tkns: list[Token], dedent: int = 0) -> str:
         src = open(filename).read()
 
     dedent = int(sys.argv[3]) if len(sys.argv) > 3 else 0
-    print(to_text(tokenize(src), dedent))
-
-    # for tkn in tokenize(src, filename=filename):
-    #     print(tkn)
+    print(to_text(tokenize(src), dedent))
\ No newline at end of file
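
A standalone sketch of the comment-dedent rule this series settles on (an
illustration only, not part of any patch; the helper name is local to this
note). Each line of a multi-line comment loses at most `dedent` leading
spaces, so a line indented by less than `dedent` is flattened to column 0
rather than truncated:

    # Mirrors the dedent > 0 branch of to_text() as of PATCH 5/6.
    def dedent_comment(text: str, dedent: int) -> str:
        out = []
        for line in text.split("\n"):
            leading_space = len(line) - len(line.lstrip(' '))
            # Strip min(leading_space, dedent) columns; never non-space chars.
            out.append(line[min(leading_space, dedent):])
        return "\n".join(out)

    assert dedent_comment("    /* x\n      y\n  z */", 4) == "/* x\n  y\nz */"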