|
1 | | -# Copyright (c) 2017-2025 Renata Hodovan, Akos Kiss. |
| 1 | +# Copyright (c) 2017-2026 Renata Hodovan, Akos Kiss. |
2 | 2 | # Copyright (c) 2020 Sebastian Kimberk. |
3 | 3 | # |
4 | 4 | # Licensed under the BSD 3-Clause License |
@@ -870,6 +870,23 @@ def chars_from_set(node: RuleContext) -> list[tuple[int, int]]: |
870 | 870 |
|
871 | 871 | return [] |
872 | 872 |
|
def token_from_set_element(set_element: RuleContext):
    """
    Resolve a parser-level set element (a token reference or a string
    literal) to the id of the unlexer rule node that produces it.

    :param set_element: the ``setElement`` rule context to resolve.
    :return: id of the corresponding unlexer rule node in ``graph``.
    """
    if set_element.TOKEN_REF():
        name = str(set_element.TOKEN_REF())
        # Token references must name an already-registered lexer rule.
        # NOTE: rule-node ids are plain name strings (see e.g. the edge to
        # '_dot' created for parser dot), so the id is the name itself —
        # returning a tuple here would never match ``v.id`` in the caller.
        assert name in graph.vertices, f'Token reference {name} not found in graph.'
        return name
    if set_element.STRING_LITERAL():
        name = str(set_element.STRING_LITERAL())[1:-1]  # strip surrounding quotes
        if name in literal_lookup:
            return literal_lookup[name]

        # Create a new unlexer node from unknown strings to expand the set of
        # possible tokens used by parser dot (and parser-level not-sets).
        lit_id = graph.add_node(UnlexerRuleNode())
        literal_lookup[name] = lit_id
        graph.add_edge(frm=lit_id, to=graph.add_node(LiteralNode(src=name)))
        return lit_id
    assert False, 'Unsupported construct in parser not set expression.'
| 889 | + |
873 | 890 | def unique_charset(ranges: list[tuple[int, int]]) -> int: |
874 | 891 | if not ranges: |
875 | 892 | raise ValueError('Charset must contain at least one range') |
@@ -964,6 +981,18 @@ def isfloat(s: str): |
964 | 981 | except ValueError: |
965 | 982 | return False |
966 | 983 |
|
def _create_token_alternatives(rule_id, token_ids):
    """
    Link ``rule_id`` to the given token rule ids: a single token is wired
    in directly, while multiple tokens are wrapped into an alternation
    node with one alternative branch per token.

    :param rule_id: id of the rule node the tokens are attached to.
    :param token_ids: ids of the unlexer rule nodes to choose between.
    """
    if len(token_ids) == 1:
        # Single choice: no alternation structure is needed.
        graph.add_edge(frm=rule_id, to=token_ids[0])
    else:
        # Every alternative is equally enabled (condition 1).
        alternation = graph.add_node(AlternationNode(rule_id=rule_id, idx=0, conditions=[1] * len(token_ids)))
        graph.add_edge(frm=rule_id, to=alternation)
        for pos, token_id in enumerate(token_ids):
            branch = graph.add_node(AlternativeNode(rule_id=rule_id, alt_idx=0, idx=pos))
            graph.add_edge(frm=alternation, to=branch)
            graph.add_edge(frm=branch, to=token_id)
| 995 | + |
967 | 996 | # TODO: Typing of build_rule is a nightmare, hence it's postponed. |
968 | 997 | def build_rule(rule, node): |
969 | 998 | lexer_rule = isinstance(rule, UnlexerRuleNode) |
@@ -1114,26 +1143,30 @@ def build_expr(node, parent_id): |
1114 | 1143 | # Create an artificial `_dot` rule with an alternation of all the lexer rules. |
1115 | 1144 | parser_dot_id = graph.add_node(UnparserRuleNode(name='_dot')) |
1116 | 1145 | unlexer_ids = [v.name for vid, v in graph.vertices.items() if isinstance(v, UnlexerRuleNode)] |
1117 | | - alt_id = graph.add_node(AlternationNode(rule_id=parser_dot_id, idx=0, conditions=[1] * len(unlexer_ids))) |
1118 | | - graph.add_edge(frm=parser_dot_id, to=alt_id) |
1119 | | - for i, lexer_id in enumerate(unlexer_ids): |
1120 | | - alternative_id = graph.add_node(AlternativeNode(rule_id=parser_dot_id, alt_idx=0, idx=i)) |
1121 | | - graph.add_edge(frm=alt_id, to=alternative_id) |
1122 | | - graph.add_edge(frm=alternative_id, to=lexer_id) |
| 1146 | + _create_token_alternatives(parser_dot_id, unlexer_ids) |
1123 | 1147 | graph.add_edge(frm=parent_id, to='_dot') |
1124 | 1148 |
|
1125 | 1149 | elif node.notSet(): |
1126 | | - if node.notSet().setElement(): |
1127 | | - not_ranges = chars_from_set(node.notSet().setElement()) |
1128 | | - else: |
1129 | | - not_ranges = [] |
1130 | | - for set_element in node.notSet().blockSet().setElement(): |
1131 | | - not_ranges.extend(chars_from_set(set_element)) |
1132 | | - |
1133 | | - charset = unique_charset(multirange_diff(graph.charsets[dot_charset], sorted(not_ranges, key=lambda x: x[0]))) |
1134 | | - graph.add_edge(frm=parent_id, to=graph.add_node(CharsetNode(rule_id=rule.id, idx=chr_idx[rule.name], charset=charset))) |
1135 | | - chr_idx[rule.name] += 1 |
| 1150 | + if isinstance(node, ANTLRv4Parser.LexerAtomContext): |
| 1151 | + if node.notSet().setElement(): |
| 1152 | + not_ranges = chars_from_set(node.notSet().setElement()) |
| 1153 | + else: |
| 1154 | + not_ranges = [] |
| 1155 | + for set_element in node.notSet().blockSet().setElement(): |
| 1156 | + not_ranges.extend(chars_from_set(set_element)) |
1136 | 1157 |
|
| 1158 | + charset = unique_charset(multirange_diff(graph.charsets[dot_charset], sorted(not_ranges, key=lambda x: x[0]))) |
| 1159 | + graph.add_edge(frm=parent_id, to=graph.add_node(CharsetNode(rule_id=rule.id, idx=chr_idx[rule.name], charset=charset))) |
| 1160 | + chr_idx[rule.name] += 1 |
| 1161 | + else: |
| 1162 | + if node.notSet().setElement(): |
| 1163 | + disabled_tokens = [token_from_set_element(node.notSet().setElement())] |
| 1164 | + else: |
| 1165 | + disabled_tokens = [] |
| 1166 | + for set_element in node.notSet().blockSet().setElement(): |
| 1167 | + disabled_tokens.append(token_from_set_element(set_element)) |
| 1168 | + enabled_token_ids = [v.id for _, v in graph.vertices.items() if isinstance(v, UnlexerRuleNode) and v.id not in disabled_tokens] |
| 1169 | + _create_token_alternatives(parent_id, enabled_token_ids) |
1137 | 1170 | elif isinstance(node, ANTLRv4Parser.LexerAtomContext) and node.characterRange(): |
1138 | 1171 | start, end = character_range_interval(node) |
1139 | 1172 | if lexer_rule: |
|
0 commit comments