Skip to content

Commit 15bb80b

Browse files
committed
Support "not set" constructs in parser rules as well
Fixes #375.
1 parent ec27a89 commit 15bb80b

File tree

3 files changed

+101
-17
lines changed

3 files changed

+101
-17
lines changed

grammarinator/tool/processor.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2017-2025 Renata Hodovan, Akos Kiss.
1+
# Copyright (c) 2017-2026 Renata Hodovan, Akos Kiss.
22
# Copyright (c) 2020 Sebastian Kimberk.
33
#
44
# Licensed under the BSD 3-Clause License
@@ -870,6 +870,23 @@ def chars_from_set(node: RuleContext) -> list[tuple[int, int]]:
870870

871871
return []
872872

873+
def token_from_set_element(set_element: RuleContext) -> list[int]:
874+
if set_element.TOKEN_REF():
875+
name = str(set_element.TOKEN_REF())
876+
assert (name,) in graph.vertices, f'Token reference {name} not found in graph.'
877+
return (name,)
878+
if set_element.STRING_LITERAL():
879+
name = str(set_element.STRING_LITERAL())[1:-1]
880+
if name in literal_lookup:
881+
return literal_lookup[name]
882+
883+
# Create a new unlexer node from unknown strings to expand the set of possible tokens used by parser dot.
884+
lit_id = graph.add_node(UnlexerRuleNode())
885+
literal_lookup[name] = lit_id
886+
graph.add_edge(frm=lit_id, to=graph.add_node(LiteralNode(src=name)))
887+
return lit_id
888+
assert False, 'Unsupported construct in parser not set expression.'
889+
873890
def unique_charset(ranges: list[tuple[int, int]]) -> int:
874891
if not ranges:
875892
raise ValueError('Charset must contain at least one range')
@@ -964,6 +981,18 @@ def isfloat(s: str):
964981
except ValueError:
965982
return False
966983

984+
def _create_token_alternatives(rule_id, token_ids):
985+
if len(token_ids) == 1:
986+
graph.add_edge(frm=rule_id, to=token_ids[0])
987+
return
988+
989+
alt_id = graph.add_node(AlternationNode(rule_id=rule_id, idx=0, conditions=[1] * len(token_ids)))
990+
graph.add_edge(frm=rule_id, to=alt_id)
991+
for i, lexer_id in enumerate(token_ids):
992+
alternative_id = graph.add_node(AlternativeNode(rule_id=rule_id, alt_idx=0, idx=i))
993+
graph.add_edge(frm=alt_id, to=alternative_id)
994+
graph.add_edge(frm=alternative_id, to=lexer_id)
995+
967996
# TODO: Typing of build_rule is a nightmare, hence it's postponed.
968997
def build_rule(rule, node):
969998
lexer_rule = isinstance(rule, UnlexerRuleNode)
@@ -1114,26 +1143,30 @@ def build_expr(node, parent_id):
11141143
# Create an artificial `_dot` rule with an alternation of all the lexer rules.
11151144
parser_dot_id = graph.add_node(UnparserRuleNode(name='_dot'))
11161145
unlexer_ids = [v.name for vid, v in graph.vertices.items() if isinstance(v, UnlexerRuleNode)]
1117-
alt_id = graph.add_node(AlternationNode(rule_id=parser_dot_id, idx=0, conditions=[1] * len(unlexer_ids)))
1118-
graph.add_edge(frm=parser_dot_id, to=alt_id)
1119-
for i, lexer_id in enumerate(unlexer_ids):
1120-
alternative_id = graph.add_node(AlternativeNode(rule_id=parser_dot_id, alt_idx=0, idx=i))
1121-
graph.add_edge(frm=alt_id, to=alternative_id)
1122-
graph.add_edge(frm=alternative_id, to=lexer_id)
1146+
_create_token_alternatives(parser_dot_id, unlexer_ids)
11231147
graph.add_edge(frm=parent_id, to='_dot')
11241148

11251149
elif node.notSet():
1126-
if node.notSet().setElement():
1127-
not_ranges = chars_from_set(node.notSet().setElement())
1128-
else:
1129-
not_ranges = []
1130-
for set_element in node.notSet().blockSet().setElement():
1131-
not_ranges.extend(chars_from_set(set_element))
1132-
1133-
charset = unique_charset(multirange_diff(graph.charsets[dot_charset], sorted(not_ranges, key=lambda x: x[0])))
1134-
graph.add_edge(frm=parent_id, to=graph.add_node(CharsetNode(rule_id=rule.id, idx=chr_idx[rule.name], charset=charset)))
1135-
chr_idx[rule.name] += 1
1150+
if isinstance(node, ANTLRv4Parser.LexerAtomContext):
1151+
if node.notSet().setElement():
1152+
not_ranges = chars_from_set(node.notSet().setElement())
1153+
else:
1154+
not_ranges = []
1155+
for set_element in node.notSet().blockSet().setElement():
1156+
not_ranges.extend(chars_from_set(set_element))
11361157

1158+
charset = unique_charset(multirange_diff(graph.charsets[dot_charset], sorted(not_ranges, key=lambda x: x[0])))
1159+
graph.add_edge(frm=parent_id, to=graph.add_node(CharsetNode(rule_id=rule.id, idx=chr_idx[rule.name], charset=charset)))
1160+
chr_idx[rule.name] += 1
1161+
else:
1162+
if node.notSet().setElement():
1163+
disabled_tokens = [token_from_set_element(node.notSet().setElement())]
1164+
else:
1165+
disabled_tokens = []
1166+
for set_element in node.notSet().blockSet().setElement():
1167+
disabled_tokens.append(token_from_set_element(set_element))
1168+
enabled_token_ids = [v.id for _, v in graph.vertices.items() if isinstance(v, UnlexerRuleNode) and v.id not in disabled_tokens]
1169+
_create_token_alternatives(parent_id, enabled_token_ids)
11371170
elif isinstance(node, ANTLRv4Parser.LexerAtomContext) and node.characterRange():
11381171
start, end = character_range_interval(node)
11391172
if lexer_rule:

tests/grammars-cxx/ParserNotSet.g4

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (c) 2026 Renata Hodovan, Akos Kiss.
3+
*
4+
* Licensed under the BSD 3-Clause License
5+
* <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
6+
* This file may not be copied, modified, or distributed except
7+
* according to those terms.
8+
*/
9+
10+
/*
11+
* The test checks the handling of the "not set" construct in parser grammars.
12+
*/
13+
14+
// TEST-PROCESS-CXX: {grammar}.g4 -o {tmpdir}
15+
// TEST-BUILD-CXX: --generator={grammar}Generator --includedir={tmpdir} --builddir={tmpdir}/build
16+
// TEST-GENERATE-CXX: {tmpdir}/build/bin/grammarinator-generate-{grammar_lower} -r start -o {tmpdir}/{grammar}.txt
17+
// TEST-ANTLR: {grammar}.g4 -o {tmpdir}
18+
// TEST-REPARSE: -p {grammar}Parser -l {grammar}Lexer -r start {tmpdir}/{grammar}%d.txt
19+
20+
grammar ParserNotSet;
21+
22+
start: ~(A | 'B') ~C EOF;
23+
24+
A : 'A';
25+
B : 'B';
26+
C : 'C';

tests/grammars/ParserNotSet.g4

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Copyright (c) 2026 Renata Hodovan, Akos Kiss.
3+
*
4+
* Licensed under the BSD 3-Clause License
5+
* <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
6+
* This file may not be copied, modified, or distributed except
7+
* according to those terms.
8+
*/
9+
10+
/*
11+
* The test checks the handling of the "not set" construct in parser grammars.
12+
*/
13+
14+
// TEST-PROCESS: {grammar}.g4 -o {tmpdir}
15+
// TEST-GENERATE: {grammar}Generator.{grammar}Generator -r start -j 1 -o {tmpdir}/{grammar}.txt
16+
// TEST-ANTLR: {grammar}.g4 -o {tmpdir}
17+
// TEST-REPARSE: -p {grammar}Parser -l {grammar}Lexer -r start {tmpdir}/{grammar}%d.txt
18+
19+
grammar ParserNotSet;
20+
21+
start: ~(A | 'B') ~C EOF;
22+
23+
A : 'A';
24+
B : 'B';
25+
C : 'C';

0 commit comments

Comments
 (0)