forked from CodeCrafter-Guy/PyLex
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
202 lines (170 loc) · 7.34 KB
/
lexer.py
File metadata and controls
202 lines (170 loc) · 7.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
A set of utility functions to tokenize an input string based on a lexer configuration.
This module includes functions for determining delimiters, checking bounds, slicing
strings, and processing tokens based on a given lexer configuration.
Functions:
- is_delimiter: Check if a character at a position in the input string is a delimiter.
- is_in_bounds: Check if the current position is within bounds of the input string.
- get_string_slice: Extract a slice of the input string until a delimiter is found.
- process_tokens: Tokenize the input string by matching tokens based on the lexer config.
"""
import re
import sys
def precompile_patterns(lexer_config):
    """
    Compile every regex pattern in the lexer configuration once, up front.

    Each token definition carrying a 'pattern' entry gains a
    'compiled_pattern' entry holding the compiled regex object, so the
    tokenizer can reuse it instead of recompiling on every match attempt.
    The configuration is mutated in place and also returned for
    convenient chaining. Tokens that already have a 'compiled_pattern'
    (or no 'pattern' at all) are left untouched.

    Parameters:
    -----------
    lexer_config : dict
        Lexer configuration with a 'tokens' list of token definitions.

    Returns:
    --------
    dict
        The same lexer_config object, with compiled patterns attached.

    Example:
        lexer_config = {'tokens': [{'type': 'identifier', 'pattern': r'[a-z]+'}]}
        precompile_patterns(lexer_config)
        # lexer_config['tokens'][0]['compiled_pattern'] is now a compiled regex
    """
    for token_def in lexer_config.get('tokens', []):
        needs_compile = 'pattern' in token_def and 'compiled_pattern' not in token_def
        if needs_compile:
            token_def['compiled_pattern'] = re.compile(token_def['pattern'])
    return lexer_config
def is_delimiter(input, current_position, delimiters):
    """
    Report whether the character at the given position is a delimiter.

    Parameters:
    -----------
    input : str
        The input string to inspect. (NOTE: the parameter name shadows
        the builtin `input`; it is kept for API compatibility.)
    current_position : int
        Index of the character to test; must be a valid index.
    delimiters : str
        Characters that count as delimiters.

    Returns:
    --------
    bool
        True when input[current_position] is one of the delimiters.

    Example:
        is_delimiter("function test() {}", 13, " (){}")  # True for "("
    """
    character = input[current_position]
    return character in delimiters
def is_in_bounds(input, current_position):
    """
    Check if the current position is a valid forward index into the input string.

    Parameters:
    -----------
    input : str
        The input string to check. (NOTE: the parameter name shadows the
        builtin `input`; it is kept for API compatibility.)
    current_position : int
        The current index in the input string.

    Returns:
    --------
    bool
        True if current_position is within [0, len(input)), False otherwise.
        Negative positions are rejected: callers index with input[pos], and
        a negative index would silently read from the end of the string.

    Example:
        is_in_bounds("function test() {}", 5)  # Returns True if 5 is a valid index
    """
    return 0 <= current_position < len(input)
def get_string_slice(input_text, current_position, delimiters):
    """
    Slice the input string starting at the current position until a delimiter is found.

    Parameters:
    -----------
    input_text : str
        The full input string being tokenized.
    current_position : int
        The position in the input string from which to start slicing.
    delimiters : str
        All characters that should be treated as delimiters.

    Returns:
    --------
    tuple
        A tuple containing:
        - slice (str): The slice of the input text up to the first delimiter
          (empty when the start position already sits on a delimiter).
        - current_position (int): The position of the delimiter that stopped
          the scan, or len(input_text) when none was found.

    Example:
        get_string_slice("function test() {}", 9, " (){}")  # Returns ('test', 13)
    """
    text_length = len(input_text)
    scan = current_position
    # Advance until we run off the end or hit a delimiter character.
    while scan < text_length and input_text[scan] not in delimiters:
        scan += 1
    # A single slice avoids per-character string concatenation.
    return input_text[current_position:scan], scan
def process_tokens(input_text, current_position, lexer_config):
    """
    Match a single token at the current position of the input string.

    Token definitions are tried in the order they appear in the lexer
    configuration. A definition with a 'value' entry matches by literal
    string comparison; a definition with a 'pattern' entry matches by
    regex anchored at the current position. Token types listed in the
    optional 'delimiter_check_for_types' list additionally require the
    literal match to be followed by a delimiter or end-of-input, which
    stops keywords from matching inside longer identifiers.

    Parameters:
    -----------
    input_text : str
        The input string to tokenize.
    current_position : int
        The current index in the input string.
    lexer_config : dict
        The lexer configuration dictionary which defines token types,
        patterns, values, and 'delimiters'.

    Returns:
    --------
    tuple
        A tuple containing:
        - token (dict or None): {'value': ..., 'type': ...} on a match,
          None when the position is out of bounds or nothing matched.
        - current_position (int): The updated position. When nothing
          matched, the position advances by one so the caller cannot
          loop forever on the same character (an error is printed to
          stderr in that case).

    Example:
        lexer_config = {
            'tokens': [
                {'type': 'keyword', 'value': 'function'},
                {'type': 'identifier', 'pattern': r'[a-zA-Z_][a-zA-Z0-9_]*'}
            ],
            'delimiters': ' (){};'
        }
        process_tokens("function test()", 0, lexer_config)
        # Returns ({'value': 'function', 'type': 'keyword'}, 8)
    """
    if current_position >= len(input_text):
        return None, current_position
    # Token types that must be followed by a delimiter to count as a match.
    delimiter_check_for_types = lexer_config.get('delimiter_check_for_types', [])
    for token in lexer_config['tokens']:
        if 'value' in token:
            token_value = token['value']
            end_position = current_position + len(token_value)
            if input_text[current_position:end_position] == token_value:
                if token['type'] in delimiter_check_for_types:
                    at_boundary = (end_position >= len(input_text)
                                   or input_text[end_position] in lexer_config['delimiters'])
                    if not at_boundary:
                        # Literal appears inside a longer word; keep trying
                        # the remaining token definitions.
                        continue
                return {'value': token_value, 'type': token['type']}, end_position
        elif 'pattern' in token:
            # Use precompiled pattern if available, otherwise compile on the fly.
            pattern = token.get('compiled_pattern') or re.compile(token['pattern'])
            # Anchor at current_position instead of slicing the tail, which
            # would copy the remainder of the string on every call.
            match = pattern.match(input_text, current_position)
            if match and match.end() > current_position:
                # Zero-width matches are rejected: they would yield an empty
                # token without advancing, stalling the calling loop.
                return {'value': match.group(0), 'type': token['type']}, match.end()
    # Nothing matched: report and skip one character so scanning advances.
    print(f"Lexical analysis config could not determine character at position {current_position}: '{input_text[current_position]}'", file=sys.stderr)
    return None, current_position + 1