
Commit 697d10a

Merge pull request #38 from MarcellPerger1/start-ast
Refactor tokenizer
2 parents 05a2e87 + 6b97cb9

File tree

9 files changed: +208 -244 lines

parser/common/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,7 @@
 from .common import *
 from .error import *  # <^ might add some stuff so `import *`
 from .str_region import StrRegion  # <-- won't add stuff to str_region so not `import *`
-from .tree_print import *
+# IMPORTant: don't include tree_print here as that causes circular import issue:
+#  - lexer.tokens imports ..common (for StrRegion)
+#  - tree_print also loaded from common/__init__.py
+#  - tree_print needs `Node`... and `Node` needs lexer.tokens

parser/cst/base_node.py

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
 
 from ..common import StrRegion, HasRegion
-from ..tokens import Token
+
+if TYPE_CHECKING:
+    from ..tokens import Token
 
 
 @dataclass
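
Moving the `Token` import under `TYPE_CHECKING` is the standard way to break the cycle described in `parser/common/__init__.py`: the name is only needed for annotations, and with `from __future__ import annotations` those are never evaluated at runtime. A minimal sketch of the pattern, with a hypothetical module `b` standing in for `parser.lexer.tokens`:

# a.py -- needs B only for type annotations
from __future__ import annotations      # annotations become lazy strings
from typing import TYPE_CHECKING

if TYPE_CHECKING:                       # True for type checkers, False at runtime
    from b import B                     # so importing a.py never triggers the cycle

def describe(obj: B) -> str:            # fine: annotation is not evaluated at runtime
    return repr(obj)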

parser/lexer/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -1,5 +1,5 @@
-from .tokenizer import (
-    Tokenizer,
-    TokenizerError, LocatedTokenizerError,
-    LocatedMalformedNumberError, MalformedNumberError,
-    print_tokens, format_tokens)
+from .tokenizer import Tokenizer
+from .token_print import print_tokens, format_tokens
+from .errors import (
+    TokenizerError, LocatedTokenizerError, MalformedNumberError,
+    LocatedMalformedNumberError)
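
The package's public names are unchanged by the split; only their home modules moved (`print_tokens`/`format_tokens` into `token_print.py`, the error classes into `errors.py`), so existing imports such as the following should keep working:

from parser.lexer import Tokenizer, print_tokens, TokenizerError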

parser/lexer/errors.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from parser.common import BaseParseError, BaseLocatedError
+
+
+class TokenizerError(BaseParseError):
+    ...
+
+
+class LocatedTokenizerError(BaseLocatedError, TokenizerError):
+    ...
+
+
+class MalformedNumberError(TokenizerError):
+    ...
+
+
+class LocatedMalformedNumberError(LocatedTokenizerError, MalformedNumberError):
+    ...
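
The four classes form a small diamond: each `Located*` error is both a located error and the corresponding plain error, so callers can catch by category whether or not location info was attached. A hedged sketch of what that buys, assuming `NumberParser` raises its `default_err_type` (`LocatedMalformedNumberError`, as set in the `number_parser.py` diff below):

from parser.lexer.errors import MalformedNumberError
from parser.lexer.number_parser import NumberParser

try:
    NumberParser("1__2").parse(0)   # double '_' is rejected by _parse_digit_seq
except MalformedNumberError:
    # Also catches LocatedMalformedNumberError via the MRO;
    # `except TokenizerError:` would catch any lexer error.
    pass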

parser/lexer/number_parser.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from string import digits
+
+from .errors import LocatedMalformedNumberError
+from .src_handler import UsesSrc
+from .tokens import Token, NumberToken
+from ..common import StrRegion
+
+
+class NumberParser(UsesSrc):
+    default_err_type = LocatedMalformedNumberError
+
+    # todo 0x, 0b (I refuse to add octal literals) - also hex floats???
+    def _parse_digit_seq(self, start: int) -> int | None:
+        # (Returns None if no digits)
+        idx = start
+        if self.get(idx) == '_':
+            raise self.err("Can't have '_' at the start of a number", idx)
+        if self.get(idx) not in digits:
+            return None
+        idx += 1
+        while True:
+            if self.get(idx) == '_':
+                if self.get(idx + 1) in digits:
+                    idx += 2  # '_' and digit
+                elif self.get(idx + 1) == '_':
+                    raise self.err(
+                        "Can only have one consecutive '_' in a number", idx + 1)
+                else:
+                    raise self.err(
+                        "Can't have '_' at the end of a number", idx)
+            elif self.get(idx) in digits:
+                idx += 1
+            else:
+                return idx  # end of digits/'_'
+
+    def _parse_num_no_exp(self, idx: int) -> int:
+        new_idx = self._parse_digit_seq(idx)
+        if new_idx is None:
+            if self.get(idx) != '.':
+                raise self.err("Number must start with digit or '.' ", idx)
+            has_pre_dot = False
+        else:
+            has_pre_dot = True
+            idx = new_idx
+        if self.get(idx) != '.':
+            # eg: 1234, 567e-5, 8 +9-10
+            return idx
+        idx += 1
+        new_idx = self._parse_digit_seq(idx)
+        if new_idx is None:
+            has_post_dot = False
+        else:
+            has_post_dot = True
+            idx = new_idx
+        if has_pre_dot or has_post_dot:
+            return idx
+        raise self.err("Number cannot be a single '.' "
+                       "(expected digits before or after)", idx)
+
+    def _parse_number(self, idx: int) -> int:
+        idx = self._parse_num_no_exp(idx)
+        if self.get(idx).lower() != 'e':
+            return idx
+        idx += 1
+        # need to handle '-' here explicitly as it is part of the number
+        # so can't just be parsed as a separate operator
+        if self.get(idx) == '-':
+            idx += 1
+        new_idx = self._parse_digit_seq(idx)  # no dot after the 'e'
+        if new_idx is None:
+            # eg: 1.2eC, 8e-Q which is always an error
+            raise self.err("Expected integer after <number>e", idx)
+        idx = new_idx
+        return idx
+
+    def parse(self, start: int) -> Token:
+        return NumberToken(StrRegion(start, self._parse_number(start)))
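
In short, `parse` accepts digit runs with single interior underscores, an optional fractional part, and an optional exponent with an optional leading '-', and returns a `NumberToken` whose region spans the literal. A small usage sketch, relying only on the `UsesSrc` constructor and `StrRegion.resolve` shown elsewhere in this commit:

from parser.lexer.number_parser import NumberParser

src = "x = 1_000.5e-3"
tok = NumberParser(src).parse(4)   # the literal starts at index 4
print(tok.region.resolve(src))     # -> 1_000.5e-3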

parser/lexer/src_handler.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Sequence
+
+from .errors import LocatedTokenizerError
+from .tokens import Token
+from ..common import StrRegion, BaseLocatedError, region_union
+
+
+class UsesSrc:
+    def __init__(self, src: str):
+        self.src: str = src
+
+    def __getitem__(self, item: int | slice) -> str:
+        return self.src[item]
+
+    def eof(self, idx: int):
+        return idx >= len(self.src)
+
+    def get(self, idx: int, eof: str = '\0') -> str:
+        try:
+            return self.src[idx]
+        except IndexError:
+            return eof
+
+    default_err_type = LocatedTokenizerError
+
+    def err(self, msg: str,
+            loc: int | Token | StrRegion | Sequence[int | Token | StrRegion],
+            tp: type[BaseLocatedError] = None):
+        try:
+            seq: tuple[int | Token | StrRegion, ...] = tuple(loc)
+        except TypeError:
+            seq = (loc,)
+        region = region_union([
+            StrRegion(o, o + 1) if isinstance(o, int) else o
+            for o in seq])
+        tp = tp or self.default_err_type
+        return tp(msg, region, self.src)
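
Note that `err` returns the exception rather than raising it, and normalizes any mix of indices, tokens, and regions into a single `StrRegion` via `region_union`, so call sites read `raise self.err(...)`. A quick sketch on a bare `UsesSrc`:

from parser.common import StrRegion
from parser.lexer.src_handler import UsesSrc

h = UsesSrc("let x = 1")
exc = h.err("unexpected character", 4)           # int becomes StrRegion(4, 5)
exc = h.err("two spots", [0, StrRegion(4, 5)])   # sequence is merged via region_union
raise exc                                        # the caller decides when to raise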

parser/lexer/token_print.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import sys
+from io import StringIO
+from typing import IO
+
+from .tokens import Token
+
+
+def print_tokens(src: str, tokens: list[Token], stream: IO[str] = None, do_ws=False):
+    if stream is None:
+        stream = sys.stdout
+    table = []
+    for tok in tokens:
+        if tok.is_whitespace:
+            if do_ws:
+                table.append(['(WS) ' + repr(tok.region.resolve(src)), tok.name])
+        else:
+            table.append([str(tok.region.resolve(src)), tok.name])
+    max0 = max(len(r[0]) for r in table)
+    max1 = max(len(r[1]) for r in table)
+    for s0, s1 in table:
+        print(f'{s0:>{max0}} | {s1:>{max1}}', file=stream)
+
+
+def format_tokens(src: str, tokens: list[Token], do_ws=False):
+    out = StringIO()
+    print_tokens(src, tokens, out, do_ws)
+    return out.getvalue()
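
`format_tokens` is just `print_tokens` captured into a `StringIO`; the output is a two-column table (resolved source text, token name) right-aligned to the widest entry. A usage sketch; the `Tokenizer(src).tokenize()` entry point is a hypothetical stand-in, since the tokenizer's API is not part of this diff:

from parser.lexer import Tokenizer, format_tokens

src = "x = 1_000.5e-3"
tokens = Tokenizer(src).tokenize()   # hypothetical entry point, not shown in this diff
print(format_tokens(src, tokens))    # rows look like:  <source text> | <token name>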
