Skip to content

Commit 1df4375

Browse files
committed
introduce left-recursive parsing
1 parent 2dc6a97 commit 1df4375

File tree

5 files changed

+286
-25
lines changed

5 files changed

+286
-25
lines changed

crates/formality-core/src/parse.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use std::fmt::Debug;
1414
/// Trait for parsing a [`Term<L>`](`crate::term::Term`) as input.
1515
/// Typically this is auto-generated with the `#[term]` procedural macro,
1616
/// but you can implement it by hand if you want a very customized parse.
17-
pub trait CoreParse<L: Language>: Sized + Debug + Clone + Eq {
17+
pub trait CoreParse<L: Language>: Sized + Debug + Clone + Eq + 'static {
1818
/// Parse a single instance of this type, returning an error if no such
1919
/// instance is present.
2020
///
@@ -58,7 +58,7 @@ where
5858
}
5959

6060
/// Record from a successful parse.
61-
#[derive(Debug, Clone)]
61+
#[derive(Debug, Clone, PartialEq, Eq)]
6262
pub struct SuccessfulParse<'t, T> {
6363
/// The new point in the input, after we've consumed whatever text we have.
6464
text: &'t str,

crates/formality-core/src/parse/parser.rs

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use crate::{
88

99
use super::{CoreParse, ParseError, ParseResult, Scope, SuccessfulParse, TokenResult};
1010

11+
mod left_recursion;
12+
1113
/// Create this struct when implementing the [`CoreParse`][] trait.
1214
/// Each `Parser` corresponds to some symbol in the grammar.
1315
/// You create a parser and then you invoke the `parse_variant`
@@ -21,6 +23,7 @@ use super::{CoreParse, ParseError, ParseResult, Scope, SuccessfulParse, TokenRes
2123
pub struct Parser<'s, 't, T, L>
2224
where
2325
L: Language,
26+
T: Debug + Clone + Eq + 'static,
2427
{
2528
scope: &'s Scope<L>,
2629
start_text: &'t str,
@@ -59,7 +62,7 @@ where
5962
impl<'s, 't, T, L> Parser<'s, 't, T, L>
6063
where
6164
L: Language,
62-
T: Debug + Eq,
65+
T: Debug + Clone + Eq + 'static,
6366
{
6467
/// Shorthand to create a parser for a nonterminal with a single variant,
6568
/// parsed by the function `op`.
@@ -69,10 +72,10 @@ where
6972
scope: &'s Scope<L>,
7073
text: &'t str,
7174
nonterminal_name: &'static str,
72-
op: impl FnOnce(&mut ActiveVariant<'s, 't, L>) -> Result<T, Set<ParseError<'t>>>,
75+
mut op: impl FnMut(&mut ActiveVariant<'s, 't, L>) -> Result<T, Set<ParseError<'t>>>,
7376
) -> ParseResult<'t, T> {
7477
Parser::multi_variant(scope, text, nonterminal_name, |parser| {
75-
parser.parse_variant(nonterminal_name, 0, op);
78+
parser.parse_variant(nonterminal_name, 0, &mut op);
7679
})
7780
}
7881

@@ -86,28 +89,30 @@ where
8689
scope: &'s Scope<L>,
8790
text: &'t str,
8891
nonterminal_name: &'static str,
89-
op: impl FnOnce(&mut Self),
92+
mut op: impl FnMut(&mut Self),
9093
) -> ParseResult<'t, T> {
91-
let tracing_span = tracing::span!(
92-
tracing::Level::TRACE,
93-
"nonterminal",
94-
name = nonterminal_name,
95-
?scope,
96-
?text
97-
);
98-
let guard = tracing_span.enter();
99-
100-
let mut parser = Self {
101-
scope,
102-
start_text: text,
103-
nonterminal_name,
104-
successes: vec![],
105-
failures: set![],
106-
};
94+
left_recursion::enter(scope, text, || {
95+
let tracing_span = tracing::span!(
96+
tracing::Level::TRACE,
97+
"nonterminal",
98+
name = nonterminal_name,
99+
?scope,
100+
?text
101+
);
102+
let guard = tracing_span.enter();
103+
104+
let mut parser = Self {
105+
scope,
106+
start_text: text,
107+
nonterminal_name,
108+
successes: vec![],
109+
failures: set![],
110+
};
107111

108-
op(&mut parser);
112+
op(&mut parser);
109113

110-
parser.finish(guard)
114+
parser.finish(guard)
115+
})
111116
}
112117

113118
/// Shorthand for `parse_variant` where the parsing operation is to
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
//! Support left-recursive grammars. This is basically just a fixed point
2+
//! operation, but we re-implement it to avoid having to return multiple
3+
//! success values, since we know that's not really needed here.
4+
//!
5+
//! This unfortunately requires unsafe and even `type_id`. This is because
6+
//! we need to be generic over the language `L` and the result type
7+
//! `T` in our `thread_local!` and you can't have generic thread-local values.
8+
//! So we have to erase types. Annoying!
9+
10+
use std::{any::TypeId, cell::RefCell, fmt::Debug};
11+
12+
use crate::{
13+
language::Language,
14+
parse::{ParseError, ParseResult, Scope, SuccessfulParse},
15+
};
16+
17+
thread_local! {
18+
static STACK: RefCell<Vec<StackEntry>> = Default::default()
19+
}
20+
21+
/// Tracks an active parse that is taking place.
22+
struct StackEntry {
23+
/// The scope pointer: we use `()` instead of `Scope<L>`
24+
scope: *const (),
25+
26+
/// The starting text: we use `*const` instead of `&'t str`
27+
start_text: *const str,
28+
29+
/// The TypeId of the type `T`.
30+
type_id: TypeId,
31+
32+
/// The intermediate value produced. If `Some`, this is a pointer
33+
/// to a `SuccessfulParse<'t, T>`.
34+
value: Option<*const ()>,
35+
36+
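/// Set to `true` once `observe` has been called on this entry, i.e., a recursive request for this same parse was seen while it was active.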
37+
observed: bool,
38+
}
39+
40+
impl StackEntry {
41+
pub fn new<L, T>(scope: &Scope<L>, start_text: &str) -> Self
42+
where
43+
L: Language,
44+
T: Clone + 'static,
45+
{
46+
Self {
47+
scope: erase_type(scope),
48+
start_text,
49+
type_id: TypeId::of::<T>(),
50+
value: None,
51+
observed: false,
52+
}
53+
}
54+
55+
pub fn matches<L, T>(&self, scope: &Scope<L>, start_text: &str) -> bool
56+
where
57+
L: Language,
58+
T: Clone + 'static,
59+
{
60+
let scope: *const () = erase_type(scope);
61+
let start_text: *const str = start_text;
62+
let type_id = TypeId::of::<T>();
63+
scope == self.scope && start_text == self.start_text && self.type_id == type_id
64+
}
65+
66+
/// UNSAFE: Caller must guarantee that `self.value` pointer is valid.
67+
pub unsafe fn observe<'t, T>(&mut self, start_text: &'t str) -> ParseResult<'t, T>
68+
where
69+
T: Clone + 'static,
70+
{
71+
assert_eq!(self.start_text, start_text as *const str);
72+
assert_eq!(self.type_id, TypeId::of::<T>());
73+
74+
self.observed = true;
75+
76+
match self.value {
77+
Some(ptr) => {
78+
let ptr = ptr as *const SuccessfulParse<'t, T>;
79+
// UNSAFE: We rely on the caller to ensure `ptr` is valid.
80+
let ptr = unsafe { &*ptr };
81+
Ok(ptr.clone())
82+
}
83+
None => Err(ParseError::at(
84+
start_text,
85+
format!("recursive grammar for `{}`", std::any::type_name::<T>()),
86+
)),
87+
}
88+
}
89+
}
90+
91+
pub fn enter<'s, 't, L, T>(
92+
scope: &'s Scope<L>,
93+
text: &'t str,
94+
mut op: impl FnMut() -> ParseResult<'t, T>,
95+
) -> ParseResult<'t, T>
96+
where
97+
L: Language,
98+
T: Debug + Clone + Eq + 'static,
99+
{
100+
tracing::trace!(
101+
"enter<{}>(scope={:?}, text={:?})",
102+
std::any::type_name::<T>(),
103+
scope,
104+
text
105+
);
106+
107+
// First check whether we are already parsing this same text in this same scope as this same type.
108+
let previous_result = STACK.with_borrow_mut(|stack| {
109+
if let Some(entry) = stack
110+
.iter_mut()
111+
.find(|entry| entry.matches::<L, T>(scope, text))
112+
{
113+
// UNSAFE: We need to justify that `entry.value` will be valid.
114+
//
115+
// Each entry in `stack` corresponds to an active stack frame `F` on this thread
116+
// and each entry in `stack` is only mutated by `F`
117+
//
118+
// The value in `entry.value` will either be `None` (in which case it is valid)
119+
// or `Some(p)` where `p` is a pointer.
120+
//
121+
// `p` will have been assigned by `F` just before invoking `op()`. It is a reference
122+
// to the last value in a vector owned by `F`. Since `F` is still active, that vector
123+
// is still valid. The borrow to produce `p` is valid (by inspection) because there are no
124+
// accesses to the vector until `op` completes
125+
// (and, to arrive at this code, `op` has not yet completed).
126+
unsafe {
127+
let result = entry.observe::<T>(text);
128+
tracing::trace!("found left-recursive stack entry, result = {:?}", result);
129+
Some(result)
130+
}
131+
} else {
132+
stack.push(StackEntry::new::<L, T>(scope, text));
133+
None
134+
}
135+
});
136+
if let Some(previous_result) = previous_result {
137+
return previous_result;
138+
}
139+
140+
// Access the top stack frame. Use a macro because we don't support closures
141+
// that are generic over the return type.
142+
macro_rules! with_top {
143+
(|$top:ident| $body:expr) => {
144+
STACK.with_borrow_mut(|stack| {
145+
let $top = stack.last_mut().unwrap();
146+
assert!($top.matches::<L, T>(scope, text));
147+
$body
148+
})
149+
};
150+
}
151+
152+
let pop_stack_before_return = |r: ParseResult<'t, T>| {
153+
STACK.with_borrow_mut(|stack| {
154+
let top = stack.pop().unwrap();
155+
assert!(top.matches::<L, T>(scope, text));
156+
});
157+
r
158+
};
159+
160+
// EXAMPLE: Consider this grammar
161+
//
162+
// ```
163+
// Expr = Expr '+' Expr
164+
// | Integer
165+
// ```
166+
//
167+
// and this input `2 + 3`. We process this in rounds.
168+
//
169+
// Round 0: Previous value `value` is `None`. When we go to parse expr, it will recurse,
170+
// which will yield an error that consumes zero tokens. We will then attempt integer,
171+
// which succeeds, yielding a parsed result of `2` with remainder `+ 3`.
172+
//
173+
// Round 1: We store `(2, "+ 3")` as the previous result and try again. When we go to parse `Expr`,
174+
// there are two options. First, we successfully parse as an integer just like before.
175+
// But also we are able to parse as `Expr + Expr`, because the left recursive reference to `Expr` yields `2`
176+
// and we can continue and parse `2 + 3`. The `Parser` prefers this longer result and so we get
177+
// `2 + 3` as the final result.
178+
//
179+
// Round 2: We store `(2+3, "")` as the previous result and try again. *This time* when we recurse,
180+
// we get `2` again! The reason why is a bit surprising. The parse of `2` succeeds with remainder
181+
// `"+ 3"`. But when we go parse `Expr + Expr`, the first `Expr` result yields `2 + 3` and there are no more
182+
// tokens, so that arm fails. In our loop below, we search back through the previous results and find that `2` has already
183+
// occurred, so we take `2 + 3` as the best overall parse.
184+
//
185+
// It's a bit subtle why this is ok. It's relying on some properties of grammars and parsing.
186+
// To be more obviously correct we would want to return sets of successful results.
187+
// In particular, the assumption is that `op` is always returning a best result (if any) and panicking on
188+
// ambiguity.
189+
190+
// First round parse is a bit special, because if we get an error here, we can just return immediately,
191+
// as there is no base case to build from.
192+
let mut values = vec![];
193+
match op() {
194+
Ok(v) => values.push(v),
195+
Err(errs) => return pop_stack_before_return(Err(errs)),
196+
};
197+
198+
// Check whether there was recursion to begin with.
199+
let observed = with_top!(|top| top.observed);
200+
if !observed {
201+
return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done.
202+
}
203+
204+
// OK, this is the interesting case. We may be able to get a better parse.
205+
loop {
206+
tracing::trace!(
207+
"reparsing of left-recursive grammar: values = {:#?}",
208+
values
209+
);
210+
211+
// If we have an intermediate value, update the stack entry to point at it.
212+
// This takes a borrow of `values` but converts it into a raw pointer.
213+
// This borrow lasts until after `op` is complete.
214+
let best_value = values.last().unwrap();
215+
with_top!(|top| {
216+
top.value = Some(erase_type(best_value));
217+
});
218+
219+
// Invoke the operation. As noted above, if we get a failed parse NOW,
220+
// we know we already found the best result, so we can just use it.
221+
let Ok(value1) = op() else {
222+
return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done.
223+
};
224+
225+
tracing::trace!("left-recursive grammar yielded: value1 = {:?}", value1);
226+
227+
// If we got back one of the previous results we saw, then we're entering
228+
// a loop and we can stop and take the best one (which should also be the longest).
229+
// In our example, this occurs when we parse `6` -- the first result
230+
// succeeds, but we have to try again to see if there's a more complex
231+
// expression that can be produced (there isn't).
232+
if values.iter().any(|v| *v == value1) {
233+
return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done.
234+
}
235+
236+
// Otherwise, we have to try again.
237+
values.push(value1);
238+
}
239+
}
240+
241+
fn erase_type<T>(s: &T) -> *const () {
242+
s as *const T as *const ()
243+
}

examples/formality-eg/grammar.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ pub enum Expr {
9797
#[precedence(1)]
9898
Div(Arc<Expr>, Arc<Expr>),
9999

100+
#[grammar(($v0))]
101+
Paren(Arc<Expr>),
102+
100103
#[grammar(let $v0 = $v1 in $v2)]
101104
LetIn(LocalVarId, Arc<Expr>, Arc<Expr>),
102105
}

examples/formality-eg/grammar/test.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use formality_core::test;
22

33
use crate::eg::term;
44

5-
use super::{StructDecl, Ty};
5+
use super::{Expr, StructDecl, Ty};
66

77
#[test]
88
fn test_struct_decl() {
@@ -30,6 +30,7 @@ fn test_struct_ty_no_args() {
3030
"#]]
3131
.assert_debug_eq(&r);
3232
}
33+
3334
#[test]
3435
fn test_vec_int_ty() {
3536
let r: Ty = term("Vec<integer>");
@@ -38,3 +39,12 @@ fn test_vec_int_ty() {
3839
"#]]
3940
.assert_debug_eq(&r);
4041
}
42+
43+
#[test]
44+
fn test_expression() {
45+
let r: Expr = term("3 + 5 * 6");
46+
expect_test::expect![[r#"
47+
3 + 5 * 6
48+
"#]]
49+
.assert_debug_eq(&r);
50+
}

0 commit comments

Comments
 (0)