|
| 1 | +//! Support left-recursive grammars. This is basically just a fixed point |
| 2 | +//! operation, but we re-implement it to avoid having to return multiple |
| 3 | +//! success values, since we know that's not really needed here. |
| 4 | +//! |
| 5 | +//! This unfortunately requires unsafe and even `type_id`. This is because |
| 6 | +//! we need to be generic over the language `L` and the result type |
| 7 | +//! `T` in our `thread_local!` and you can't have generic thread-local values. |
| 8 | +//! So we have to erase types. Annoying! |
| 9 | +
|
| 10 | +use std::{any::TypeId, cell::RefCell, fmt::Debug}; |
| 11 | + |
| 12 | +use crate::{ |
| 13 | + language::Language, |
| 14 | + parse::{ParseError, ParseResult, Scope, SuccessfulParse}, |
| 15 | +}; |
| 16 | + |
| 17 | +thread_local! { |
| 18 | + static STACK: RefCell<Vec<StackEntry>> = Default::default() |
| 19 | +} |
| 20 | + |
/// Record of one parse that is currently in progress on this thread.
///
/// All borrowed data is stored as raw, type-erased pointers because the
/// thread-local that holds these entries cannot be generic.
struct StackEntry {
    /// Address of the scope being parsed in, erased from `&Scope<L>` to `*const ()`.
    scope: *const (),

    /// The text the parse started from, stored as `*const str` rather than `&'t str`.
    start_text: *const str,

    /// `TypeId` of the result type `T` being parsed.
    type_id: TypeId,

    /// Best intermediate result so far, if any. When `Some`, the pointer
    /// refers to a `SuccessfulParse<'t, T>`.
    value: Option<*const ()>,

    /// Set to `true` once `observe` has been called on this entry, i.e. once
    /// a nested parse asked for the same (scope, text, type) combination.
    observed: bool,
}
| 39 | + |
| 40 | +impl StackEntry { |
| 41 | + pub fn new<L, T>(scope: &Scope<L>, start_text: &str) -> Self |
| 42 | + where |
| 43 | + L: Language, |
| 44 | + T: Clone + 'static, |
| 45 | + { |
| 46 | + Self { |
| 47 | + scope: erase_type(scope), |
| 48 | + start_text, |
| 49 | + type_id: TypeId::of::<T>(), |
| 50 | + value: None, |
| 51 | + observed: false, |
| 52 | + } |
| 53 | + } |
| 54 | + |
| 55 | + pub fn matches<L, T>(&self, scope: &Scope<L>, start_text: &str) -> bool |
| 56 | + where |
| 57 | + L: Language, |
| 58 | + T: Clone + 'static, |
| 59 | + { |
| 60 | + let scope: *const () = erase_type(scope); |
| 61 | + let start_text: *const str = start_text; |
| 62 | + let type_id = TypeId::of::<T>(); |
| 63 | + scope == self.scope && start_text == self.start_text && self.type_id == type_id |
| 64 | + } |
| 65 | + |
| 66 | + /// UNSAFE: Caller must guarantee that `self.value` pointer is valid. |
| 67 | + pub unsafe fn observe<'t, T>(&mut self, start_text: &'t str) -> ParseResult<'t, T> |
| 68 | + where |
| 69 | + T: Clone + 'static, |
| 70 | + { |
| 71 | + assert_eq!(self.start_text, start_text as *const str); |
| 72 | + assert_eq!(self.type_id, TypeId::of::<T>()); |
| 73 | + |
| 74 | + self.observed = true; |
| 75 | + |
| 76 | + match self.value { |
| 77 | + Some(ptr) => { |
| 78 | + let ptr = ptr as *const SuccessfulParse<'t, T>; |
| 79 | + // UNSAFE: We rely on the caller to entry ptr is valid. |
| 80 | + let ptr = unsafe { &*ptr }; |
| 81 | + Ok(ptr.clone()) |
| 82 | + } |
| 83 | + None => Err(ParseError::at( |
| 84 | + start_text, |
| 85 | + format!("recursive grammar for `{}`", std::any::type_name::<T>()), |
| 86 | + )), |
| 87 | + } |
| 88 | + } |
| 89 | +} |
| 90 | + |
| 91 | +pub fn enter<'s, 't, L, T>( |
| 92 | + scope: &'s Scope<L>, |
| 93 | + text: &'t str, |
| 94 | + mut op: impl FnMut() -> ParseResult<'t, T>, |
| 95 | +) -> ParseResult<'t, T> |
| 96 | +where |
| 97 | + L: Language, |
| 98 | + T: Debug + Clone + Eq + 'static, |
| 99 | +{ |
| 100 | + tracing::trace!( |
| 101 | + "enter<{}>(scope={:?}, text={:?})", |
| 102 | + std::any::type_name::<T>(), |
| 103 | + scope, |
| 104 | + text |
| 105 | + ); |
| 106 | + |
| 107 | + // First check whether we are already parsing this same text in this same scope as this same type. |
| 108 | + let previous_result = STACK.with_borrow_mut(|stack| { |
| 109 | + if let Some(entry) = stack |
| 110 | + .iter_mut() |
| 111 | + .find(|entry| entry.matches::<L, T>(scope, text)) |
| 112 | + { |
| 113 | + // UNSAFE: We need to justify that `entry.value` will be valid. |
| 114 | + // |
| 115 | + // Each entry in `stack` corresponds to an active stack frame `F` on this thread |
| 116 | + // and each entry in `stack` is only mutated by `F` |
| 117 | + // |
| 118 | + // The value in `entry.value` will either be `None` (in which case it is valid) |
| 119 | + // or `Some(p)` where `p` is a pointer. |
| 120 | + // |
| 121 | + // `p` will have been assigned by `F` just before invoking `op()`. It is a reference |
| 122 | + // to the last value in a vector owned by `F`. Since `F` is still active, that vector |
| 123 | + // is still valid. The borrow to produce `p` is valid (by inspection) because there are no |
| 124 | + // accesses to the vector until `op` completes |
| 125 | + // (and, to arrive at this code, `op` has not yet completed). |
| 126 | + unsafe { |
| 127 | + let result = entry.observe::<T>(text); |
| 128 | + tracing::trace!("found left-recursive stack entry, result = {:?}", result); |
| 129 | + Some(result) |
| 130 | + } |
| 131 | + } else { |
| 132 | + stack.push(StackEntry::new::<L, T>(scope, text)); |
| 133 | + None |
| 134 | + } |
| 135 | + }); |
| 136 | + if let Some(previous_result) = previous_result { |
| 137 | + return previous_result; |
| 138 | + } |
| 139 | + |
| 140 | + // Access the top stack frame. Use a macro because we don't support closures |
| 141 | + // that are generic over the return type. |
| 142 | + macro_rules! with_top { |
| 143 | + (|$top:ident| $body:expr) => { |
| 144 | + STACK.with_borrow_mut(|stack| { |
| 145 | + let $top = stack.last_mut().unwrap(); |
| 146 | + assert!($top.matches::<L, T>(scope, text)); |
| 147 | + $body |
| 148 | + }) |
| 149 | + }; |
| 150 | + } |
| 151 | + |
| 152 | + let pop_stack_before_return = |r: ParseResult<'t, T>| { |
| 153 | + STACK.with_borrow_mut(|stack| { |
| 154 | + let top = stack.pop().unwrap(); |
| 155 | + assert!(top.matches::<L, T>(scope, text)); |
| 156 | + }); |
| 157 | + r |
| 158 | + }; |
| 159 | + |
| 160 | + // EXAMPLE: Consider this grammar |
| 161 | + // |
| 162 | + // ``` |
| 163 | + // Expr = Expr '+' Expr |
| 164 | + // | Integer |
| 165 | + // ``` |
| 166 | + // |
| 167 | + // and this input `2 + 3`. We process this in rounds. |
| 168 | + // |
| 169 | + // Round 0: Previous value `value` is `None`. When we go to parse expr, it will recurse, |
| 170 | + // which will yield an error that consumes zero tokens. We will then attempt integer, |
| 171 | + // which succeeds, yielding a parsed result of `2` with remainder `+ 3`. |
| 172 | + // |
| 173 | + // Round 1: We store `(2, "+ 3")` as the previous result and try again. When we go to parse `Expr`, |
| 174 | + // there are two options. First, we successfully parse as an integer just like before. |
| 175 | + // But also we are able to parse as `Expr + Expr`, because the left recursive reference to `Expr` yields `2` |
| 176 | + // and we can continue and parse `2 + 3`. The `Parser` prefers this longer result and so we get |
| 177 | + // `2 + 3` as the final result. |
| 178 | + // |
| 179 | + // Round 2: We store `(2+3, "")` as the previous result and try again. *This time* when we recurse, |
| 180 | + // we get `2` again! The reason why is a bit surprising. The parse of `2` succeeds with remainder |
| 181 | + // `"+ 3"`. But when we go parse `Expr + Expr`, the first `Expr` result yields `2 + 3` and there are no more |
| 182 | + // tokens, so that arm fails. In our loop below, we search back through the result and find that `2` has already |
| 183 | + // occurred, so we take `2 + 3` as the best overall parse. |
| 184 | + // |
| 185 | + // It's a bit subtle why this is ok. It's relying on some properties of grammars and parsing. |
| 186 | + // To be more obviously correct we would want to return sets of successful results. |
| 187 | + // In particular, the assumption is that `op` is always returning a best result (if any) and panicking on |
| 188 | + // ambiguity. |
| 189 | + |
| 190 | + // First round parse is a bit special, because if we get an error here, we can just return immediately, |
| 191 | + // as there is no base case to build from. |
| 192 | + let mut values = vec![]; |
| 193 | + match op() { |
| 194 | + Ok(v) => values.push(v), |
| 195 | + Err(errs) => return pop_stack_before_return(Err(errs)), |
| 196 | + }; |
| 197 | + |
| 198 | + // Check whether there was recursion to begin with. |
| 199 | + let observed = with_top!(|top| top.observed); |
| 200 | + if !observed { |
| 201 | + return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done. |
| 202 | + } |
| 203 | + |
| 204 | + // OK, this is the interesting case. We may be able to get a better parse. |
| 205 | + loop { |
| 206 | + tracing::trace!( |
| 207 | + "reparsing of left-recursive grammar: values = {:#?}", |
| 208 | + values |
| 209 | + ); |
| 210 | + |
| 211 | + // If we have an intermediate value, update the stack entry to point at. |
| 212 | + // This takes a borrow of `value` but converts it into a raw pointer. |
| 213 | + // This borrow lasts until after `op` is complete. |
| 214 | + let best_value = values.last().unwrap(); |
| 215 | + with_top!(|top| { |
| 216 | + top.value = Some(erase_type(best_value)); |
| 217 | + }); |
| 218 | + |
| 219 | + // Invoke the operation. As noted above, if we get a failed parse NOW, |
| 220 | + // we know we already found the best result, so we can just use it. |
| 221 | + let Ok(value1) = op() else { |
| 222 | + return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done. |
| 223 | + }; |
| 224 | + |
| 225 | + tracing::trace!("left-recursive grammar yielded: value1 = {:?}", value1); |
| 226 | + |
| 227 | + // If we got back on the previous results we saw, then we're entering |
| 228 | + // a loop and we can stop and take the best one (which should also be the longest). |
| 229 | + // In our example, this occurs when we parse `6` -- the first result |
| 230 | + // succeeds, but we have to try again to see if there's a more complex |
| 231 | + // expression that can be produced (there isn't). |
| 232 | + if values.iter().any(|v| *v == value1) { |
| 233 | + return pop_stack_before_return(Ok(values.pop().unwrap())); // If not, we are done. |
| 234 | + } |
| 235 | + |
| 236 | + // Otherwise, we have to try again. |
| 237 | + values.push(value1); |
| 238 | + } |
| 239 | +} |
| 240 | + |
/// Converts a reference into an untyped raw pointer to the same address.
fn erase_type<T>(s: &T) -> *const () {
    let typed: *const T = s;
    typed.cast()
}
0 commit comments