forked from CodeCrafter-Guy/PyLex
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
202 lines (170 loc) · 7.34 KB
/
lexer.py
File metadata and controls
202 lines (170 loc) · 7.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
A set of utility functions to tokenize an input string based on a lexer configuration.
This module includes functions for determining delimiters, checking bounds, slicing
strings, and processing tokens based on a given lexer configuration.
Functions:
- is_delimiter: Check if a character at a position in the input string is a delimiter.
- is_in_bounds: Check if the current position is within bounds of the input string.
- get_string_slice: Extract a slice of the input string until a delimiter is found.
- process_tokens: Tokenize the input string by matching tokens based on the lexer config.
"""
import re
import sys
def precompile_patterns(lexer_config):
    """
    Compile every regex pattern in the lexer configuration once, up front.

    Each token definition carrying a 'pattern' entry gains a
    'compiled_pattern' entry holding the compiled regex object, so the
    tokenizer can reuse it instead of recompiling on every match attempt.
    The configuration is mutated in place and also returned for
    convenient chaining. Tokens that already have a 'compiled_pattern'
    (or no 'pattern' at all) are left untouched.

    Parameters:
    -----------
    lexer_config : dict
        Lexer configuration with a 'tokens' list of token definitions.

    Returns:
    --------
    dict
        The same lexer_config object, with compiled patterns attached.

    Example:
        lexer_config = {'tokens': [{'type': 'identifier', 'pattern': r'[a-z]+'}]}
        precompile_patterns(lexer_config)
        # lexer_config['tokens'][0]['compiled_pattern'] is now a compiled regex
    """
    for token_def in lexer_config.get('tokens', []):
        needs_compile = 'pattern' in token_def and 'compiled_pattern' not in token_def
        if needs_compile:
            token_def['compiled_pattern'] = re.compile(token_def['pattern'])
    return lexer_config
def is_delimiter(input, current_position, delimiters):
    """
    Report whether the character at the given position is a delimiter.

    Parameters:
    -----------
    input : str
        The input string to inspect. (NOTE: the parameter name shadows
        the builtin `input`; it is kept for API compatibility.)
    current_position : int
        Index of the character to test; must be a valid index.
    delimiters : str
        Characters that count as delimiters.

    Returns:
    --------
    bool
        True when input[current_position] is one of the delimiters.

    Example:
        is_delimiter("function test() {}", 13, " (){}")  # True for "("
    """
    character = input[current_position]
    return character in delimiters
def is_in_bounds(input, current_position):
    """
    Check if the current position is a valid forward index into the input string.

    Parameters:
    -----------
    input : str
        The input string to check. (NOTE: the parameter name shadows the
        builtin `input`; it is kept for API compatibility.)
    current_position : int
        The current index in the input string.

    Returns:
    --------
    bool
        True if current_position is within [0, len(input)), False otherwise.
        Negative positions are rejected: callers index with input[pos], and
        a negative index would silently read from the end of the string.

    Example:
        is_in_bounds("function test() {}", 5)  # Returns True if 5 is a valid index
    """
    return 0 <= current_position < len(input)
def get_string_slice(input_text, current_position, delimiters):
    """
    Slice the input string starting at the current position until a delimiter is found.

    Parameters:
    -----------
    input_text : str
        The full input string being tokenized.
    current_position : int
        The position in the input string from which to start slicing.
    delimiters : str
        All characters that should be treated as delimiters.

    Returns:
    --------
    tuple
        A tuple containing:
        - slice (str): The slice of the input text up to the first delimiter
          (empty when the start position already sits on a delimiter).
        - current_position (int): The position of the delimiter that stopped
          the scan, or len(input_text) when none was found.

    Example:
        get_string_slice("function test() {}", 9, " (){}")  # Returns ('test', 13)
    """
    text_length = len(input_text)
    scan = current_position
    # Advance until we run off the end or hit a delimiter character.
    while scan < text_length and input_text[scan] not in delimiters:
        scan += 1
    # A single slice avoids per-character string concatenation.
    return input_text[current_position:scan], scan
def process_tokens(input_text, current_position, lexer_config):
    """
    Match a single token at the current position of the input string.

    Token definitions are tried in the order they appear in the lexer
    configuration. A definition with a 'value' entry matches by literal
    string comparison; a definition with a 'pattern' entry matches by
    regex anchored at the current position. Token types listed in the
    optional 'delimiter_check_for_types' list additionally require the
    literal match to be followed by a delimiter or end-of-input, which
    stops keywords from matching inside longer identifiers.

    Parameters:
    -----------
    input_text : str
        The input string to tokenize.
    current_position : int
        The current index in the input string.
    lexer_config : dict
        The lexer configuration dictionary which defines token types,
        patterns, values, and 'delimiters'.

    Returns:
    --------
    tuple
        A tuple containing:
        - token (dict or None): {'value': ..., 'type': ...} on a match,
          None when the position is out of bounds or nothing matched.
        - current_position (int): The updated position. When nothing
          matched, the position advances by one so the caller cannot
          loop forever on the same character (an error is printed to
          stderr in that case).

    Example:
        lexer_config = {
            'tokens': [
                {'type': 'keyword', 'value': 'function'},
                {'type': 'identifier', 'pattern': r'[a-zA-Z_][a-zA-Z0-9_]*'}
            ],
            'delimiters': ' (){};'
        }
        process_tokens("function test()", 0, lexer_config)
        # Returns ({'value': 'function', 'type': 'keyword'}, 8)
    """
    if current_position >= len(input_text):
        return None, current_position
    # Token types that must be followed by a delimiter to count as a match.
    delimiter_check_for_types = lexer_config.get('delimiter_check_for_types', [])
    for token in lexer_config['tokens']:
        if 'value' in token:
            token_value = token['value']
            end_position = current_position + len(token_value)
            if input_text[current_position:end_position] == token_value:
                if token['type'] in delimiter_check_for_types:
                    at_boundary = (end_position >= len(input_text)
                                   or input_text[end_position] in lexer_config['delimiters'])
                    if not at_boundary:
                        # Literal appears inside a longer word; keep trying
                        # the remaining token definitions.
                        continue
                return {'value': token_value, 'type': token['type']}, end_position
        elif 'pattern' in token:
            # Use precompiled pattern if available, otherwise compile on the fly.
            pattern = token.get('compiled_pattern') or re.compile(token['pattern'])
            # Anchor at current_position instead of slicing the tail, which
            # would copy the remainder of the string on every call.
            match = pattern.match(input_text, current_position)
            if match and match.end() > current_position:
                # Zero-width matches are rejected: they would yield an empty
                # token without advancing, stalling the calling loop.
                return {'value': match.group(0), 'type': token['type']}, match.end()
    # Nothing matched: report and skip one character so scanning advances.
    print(f"Lexical analysis config could not determine character at position {current_position}: '{input_text[current_position]}'", file=sys.stderr)
    return None, current_position + 1