diff --git a/cfgrammar/src/lib/yacc/ast.rs b/cfgrammar/src/lib/yacc/ast.rs index 849973c76..f8cb781f0 100644 --- a/cfgrammar/src/lib/yacc/ast.rs +++ b/cfgrammar/src/lib/yacc/ast.rs @@ -1,5 +1,6 @@ use std::{ collections::{HashMap, HashSet}, + error::Error, fmt, str::FromStr, }; @@ -15,8 +16,25 @@ use crate::{ Span, header::{GrmtoolsSectionParser, HeaderError, HeaderErrorKind, HeaderValue}, }; + +/// The error returned when a requested modification of an `ASTWithValidityInfo` cannot be applied. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct ASTModificationError { + kind: YaccGrammarErrorKind, +} + +impl Error for ASTModificationError {} + +impl fmt::Display for ASTModificationError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.kind) + } +} + /// Contains a `GrammarAST` structure produced from a grammar source file. /// As well as any errors which occurred during the construction of the AST. +#[derive(Debug, Clone)] +#[cfg_attr(test, derive(PartialEq))] pub struct ASTWithValidityInfo { yacc_kind: YaccKind, ast: GrammarAST, @@ -70,6 +88,23 @@ impl ASTWithValidityInfo { pub fn errors(&self) -> &[YaccGrammarError] { self.errs.as_slice() } + + pub fn clone_and_change_start_rule(&self, rule: Rule) -> Result { + if self.ast.get_rule(&rule.name.0).is_some() { + let mut ret = self.clone(); + // The `Span` of the `start` field and the `name` field typically differ + // in that `start` is the parameter of a `%start` declaration, while + // `name` refers to the definition site of the rule itself. + // + // Lacking a better `Span`, we use the definition site for the `%start` rule here. + ret.ast.start = Some(rule.name); + Ok(ret) + } else { + Err(ASTModificationError { + kind: YaccGrammarErrorKind::InvalidStartRule(rule.name.0), + }) + } + } } impl FromStr for ASTWithValidityInfo { @@ -110,7 +145,8 @@ impl FromStr for ASTWithValidityInfo { /// An AST representing a grammar. 
This is built up gradually: when it is finished, the /// `complete_and_validate` must be called exactly once in order to finish the set-up. At that /// point, any further mutations made to the struct lead to undefined behaviour. -#[derive(Debug)] +#[derive(Debug, Clone)] +#[cfg_attr(test, derive(PartialEq))] #[non_exhaustive] pub struct GrammarAST { pub start: Option<(String, Span)>, @@ -140,14 +176,15 @@ pub struct GrammarAST { pub expect_unused: Vec, } -#[derive(Debug)] +#[derive(Debug, Clone)] +#[cfg_attr(test, derive(Eq, PartialEq))] pub struct Rule { pub name: (String, Span), pub pidxs: Vec, // index into GrammarAST.prod pub actiont: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] #[cfg_attr(test, derive(Eq, PartialEq))] pub struct Production { pub symbols: Vec, @@ -772,4 +809,30 @@ mod test { .contains(&ast_validity.ast().tokens.get_index_of("b").unwrap()) ); } + + #[test] + fn clone_ast_changing_start_rule() { + use super::*; + use crate::yacc::*; + let y_src = r#" + %start AStart + %token A B C + %% + AStart: A ':' BStart ';'; + BStart: B ',' C | C ',' B; + "#; + + let astart_ast_validity = + ASTWithValidityInfo::new(YaccKind::Original(YaccOriginalActionKind::NoAction), &y_src); + let bstart_rule = astart_ast_validity.ast().get_rule("BStart").unwrap(); + let bstart_ast_validity = astart_ast_validity + .clone_and_change_start_rule(bstart_rule.clone()) + .unwrap(); + assert!(astart_ast_validity.is_valid()); + assert!(bstart_ast_validity.is_valid()); + assert_eq!( + bstart_ast_validity.ast().start.as_ref(), + Some(&bstart_rule.name) + ); + } } diff --git a/cfgrammar/src/lib/yacc/mod.rs b/cfgrammar/src/lib/yacc/mod.rs index 68367525e..cdea4953a 100644 --- a/cfgrammar/src/lib/yacc/mod.rs +++ b/cfgrammar/src/lib/yacc/mod.rs @@ -17,7 +17,7 @@ use quote::quote; use serde::{Deserialize, Serialize}; /// The particular Yacc variant this grammar makes use of. 
-#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[non_exhaustive] pub enum YaccKind { @@ -43,7 +43,7 @@ impl quote::ToTokens for YaccKind { } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum YaccOriginalActionKind { /// Execute user-specified actions attached to each production; also requires a %actiontype diff --git a/lrpar/cttests/Cargo.toml b/lrpar/cttests/Cargo.toml index cbb23cba5..b157c3e20 100644 --- a/lrpar/cttests/Cargo.toml +++ b/lrpar/cttests/Cargo.toml @@ -12,7 +12,7 @@ crate-type = ["cdylib"] [build-dependencies] cfgrammar = { path = "../../cfgrammar" } lrlex = { path = "../../lrlex" } -lrpar = { path = "../" } +lrpar = { path = "../", features = ["_unstable_api"] } glob.workspace = true yaml-rust2.workspace = true cfg_aliases = "0.2.1" diff --git a/lrpar/cttests/build.rs b/lrpar/cttests/build.rs index 25e17be25..7d90f5805 100644 --- a/lrpar/cttests/build.rs +++ b/lrpar/cttests/build.rs @@ -1,3 +1,4 @@ +use cfgrammar::yacc::ast::ASTWithValidityInfo; use glob::glob; #[path = "src/cgen_helper.rs"] mod cgen_helper; @@ -23,27 +24,91 @@ fn main() -> Result<(), Box> { wasm32_unknown: { all(target_arch = "wasm32", target_os="unknown", target_vendor="unknown") }, } - // Because we're modifying the `StorageT` this isn't something `run_test_path` can do, - // Since it modifies the type of the builder. 
- CTLexerBuilder::>::new_with_lexemet() - .rust_edition(lrlex::RustEdition::Rust2021) - .output_path(format!( - "{}/storaget.l.rs", - std::env::var("OUT_DIR").unwrap() - )) - .lrpar_config(|ctp| { - ctp.rust_edition(lrpar::RustEdition::Rust2021) - .output_path(format!( - "{}/storaget.y.rs", - std::env::var("OUT_DIR").unwrap() - )) - .grammar_in_src_dir("storaget.y") - .unwrap() - }) - .lexer_in_src_dir("storaget.l") - .unwrap() - .build() - .unwrap(); + { + // Because we're modifying the `StorageT` this isn't something `run_test_path` can do, + // since it modifies the type of the builder. + CTLexerBuilder::>::new_with_lexemet() + .rust_edition(lrlex::RustEdition::Rust2021) + .output_path(format!( + "{}/storaget.l.rs", + std::env::var("OUT_DIR").unwrap() + )) + .lrpar_config(|ctp| { + ctp.rust_edition(lrpar::RustEdition::Rust2021) + .output_path(format!( + "{}/storaget.y.rs", + std::env::var("OUT_DIR").unwrap() + )) + .grammar_in_src_dir("storaget.y") + .unwrap() + }) + .lexer_in_src_dir("storaget.l") + .unwrap() + .build() + .unwrap(); + } + + { + use lrpar::unstable_api::UnstableApi; + // In this case we'll be building multiple grammars + // + // 1. Parse `multi_start.y` into an AST. + // 2. Clone the original and change the start rule. + // 3. Build a grammar for `multi_start.y` unchanged. + // 4. Build the modified grammar. 
+ let grammar_path = &std::env::current_dir().unwrap().join("src/multi_start.y"); + let grammar_src = std::fs::read_to_string(grammar_path).unwrap(); + let grammar_src_clone = grammar_src.clone(); + let valid_ast = ASTWithValidityInfo::new(cfgrammar::yacc::YaccKind::Grmtools, &grammar_src); + eprintln!("rules {:?}", valid_ast.ast().rules); + let bstart_rule = valid_ast.ast().get_rule("BStart").unwrap().clone(); + let modified_ast = valid_ast.clone_and_change_start_rule(bstart_rule).unwrap(); + CTLexerBuilder::new() + .lrpar_config(move |ctp| { + ctp.grammar_ast(valid_ast.clone(), UnstableApi) + .with_grammar_src(grammar_src.clone(), UnstableApi) + .grammar_in_src_dir("multi_start.y") + .unwrap() + .mod_name("ast_unmodified_y") + .output_path(format!( + "{}/ast_unmodified.y.rs", + std::env::var("OUT_DIR").unwrap() + )) + }) + .lexer_in_src_dir("multi_start.l") + .unwrap() + .output_path(format!( + "{}/ast_unmodified.l.rs", + std::env::var("OUT_DIR").unwrap() + )) + .mod_name("ast_unmodified_l") + .build() + .unwrap(); + CTLexerBuilder::new() + .lrpar_config(move |ctp| { + ctp.grammar_ast(modified_ast.clone(), UnstableApi) + .with_grammar_src(grammar_src_clone.clone(), UnstableApi) + .grammar_in_src_dir("multi_start.y") + .unwrap() + .mod_name("ast_modified_y") + .output_path(format!( + "{}/ast_modified.y.rs", + std::env::var("OUT_DIR").unwrap() + )) + // We still need to disable these because they are checked after ast validation. 
+ .warnings_are_errors(false) + .show_warnings(false) + }) + .lexer_in_src_dir("multi_start.l") + .unwrap() + .mod_name("ast_modified_l") + .output_path(format!( + "{}/ast_modified.l.rs", + std::env::var("OUT_DIR").unwrap() + )) + .build() + .unwrap(); + } println!("cargo::rerun-if-changed=src/storaget.l"); println!( "cargo::rerun-if-changed={}/storaget.l.rs", diff --git a/lrpar/cttests/src/lib.rs b/lrpar/cttests/src/lib.rs index df8f20bd7..2eb34a224 100644 --- a/lrpar/cttests/src/lib.rs +++ b/lrpar/cttests/src/lib.rs @@ -62,6 +62,12 @@ lrpar_mod!("storaget.y"); lrlex_mod!("grmtools_section.l"); lrpar_mod!("grmtools_section.y"); +lrlex_mod!("ast_unmodified.l"); +lrpar_mod!("ast_unmodified.y"); + +lrlex_mod!("ast_modified.l"); +lrpar_mod!("ast_modified.y"); + #[test] fn multitypes() { let lexerdef = multitypes_l::lexerdef(); @@ -423,6 +429,26 @@ fn test_lex_flags() { } } +#[test] +fn ast_unmodified() { + let lexerdef = ast_unmodified_l::lexerdef(); + let lexer = lexerdef.lexer("A: BBBB, CCCCC;"); + match &ast_unmodified_y::parse(&lexer) { + (_, errs) if errs.is_empty() => (), + (_, e) => panic!("{:?}", e), + } +} + +#[test] +fn ast_modified() { + let lexerdef = ast_modified_l::lexerdef(); + let lexer = lexerdef.lexer("CCCCC, BBBB"); + match &ast_modified_y::parse(&lexer) { + (_, errs) if errs.is_empty() => (), + (_, e) => panic!("{:?}", e), + } +} + // Codegen failure tests #[cfg(test)] generate_codegen_fail_tests!("src/ctfails/*.test"); diff --git a/lrpar/cttests/src/multi_start.l b/lrpar/cttests/src/multi_start.l new file mode 100644 index 000000000..29518b8a6 --- /dev/null +++ b/lrpar/cttests/src/multi_start.l @@ -0,0 +1,8 @@ +%% +A+ 'A' +B+ 'B' +C+ 'C' +; ';' +: ':' +, ',' +[ \n\t] ; \ No newline at end of file diff --git a/lrpar/cttests/src/multi_start.y b/lrpar/cttests/src/multi_start.y new file mode 100644 index 000000000..69ac59d8b --- /dev/null +++ b/lrpar/cttests/src/multi_start.y @@ -0,0 +1,13 @@ +%grmtools{yacckind: Grmtools} +%start AStart +%token A B C 
+%% + +AStart -> () + : A ':' BStart ';' {()} + ; + +BStart -> () + : B ',' C {()} + | C ',' B {()} + ; \ No newline at end of file diff --git a/lrpar/src/lib/ctbuilder.rs b/lrpar/src/lib/ctbuilder.rs index 234815b20..19e287c29 100644 --- a/lrpar/src/lib/ctbuilder.rs +++ b/lrpar/src/lib/ctbuilder.rs @@ -18,6 +18,10 @@ use crate::{ LexerTypes, RTParserBuilder, RecoveryKind, diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter}, }; + +#[cfg(feature = "_unstable_api")] +use crate::unstable_api::UnstableApi; + use bincode::{Decode, Encode, decode_from_slice, encode_to_vec}; use cfgrammar::{ Location, RIdx, Symbol, @@ -230,6 +234,10 @@ where // certainly needs to be included as part of the rebuild_cache function below so that, if it's // changed, the grammar is rebuilt. grammar_path: Option, + // If specified rather than reading source from `grammar_path`, use this string directly + grammar_src: Option, + // If specified along with `grammar_src`, use this rather than building an ast from `grammar_src`. + from_ast: Option, output_path: Option, mod_name: Option<&'a str>, recoverer: Option, @@ -287,6 +295,8 @@ where pub fn new() -> Self { CTParserBuilder { grammar_path: None, + grammar_src: None, + from_ast: None, output_path: None, mod_name: None, recoverer: None, @@ -352,6 +362,14 @@ where Ok(self.output_path(outp)) } + /// If set, specifies that this grammar should be built from a pre-validated AST + /// instead of parsing a `.y` file into one. The grammar source is still consulted (e.g. for its header and for diagnostics), but no AST is built from it. + #[cfg(feature = "_unstable_api")] + pub fn grammar_ast(mut self, valid_ast: ASTWithValidityInfo, _api_key: UnstableApi) -> Self { + self.from_ast = Some(valid_ast); + self + } + /// Set the input grammar path to `inp`. If specified, you must also call /// [CTParserBuilder::output_path]. In general it is easier to use /// [CTParserBuilder::grammar_in_src_dir]. 
@@ -363,6 +381,12 @@ where self } + #[cfg(feature = "_unstable_api")] + pub fn with_grammar_src(mut self, src: String, _api_key: UnstableApi) -> Self { + self.grammar_src = Some(src); + self + } + /// Set the output grammar path to `outp`. Note that there are no requirements on `outp`: the /// file can exist anywhere you can create a valid [Path] to. However, if you wish to use /// [crate::lrpar_mod!] you will need to make sure that `outp` is in @@ -556,8 +580,12 @@ where lk.insert(outp.clone()); } - let inc = - read_to_string(grmp).map_err(|e| format!("When reading '{}': {e}", grmp.display()))?; + let inc = if let Some(grammar_src) = &self.grammar_src { + grammar_src.clone() + } else { + read_to_string(grmp).map_err(|e| format!("When reading '{}': {e}", grmp.display()))? + }; + let yacc_diag = SpannedDiagnosticFormatter::new(&inc, grmp); let parsed_header = GrmtoolsSectionParser::new(&inc, false).parse(); if let Err(errs) = parsed_header { @@ -579,7 +607,9 @@ where .map(YaccKind::try_from) .transpose()?; header.mark_used(&"yacckind".to_string()); - let ast_validation = if let Some(yk) = self.yacckind { + let ast_validation = if let Some(ast) = &self.from_ast { + ast.clone() + } else if let Some(yk) = self.yacckind { ASTWithValidityInfo::new(yk, &inc) } else { Err("Missing 'yacckind'".to_string())? @@ -884,6 +914,8 @@ where self.output_path = Some(outp.as_ref().to_owned()); let cl: CTParserBuilder = CTParserBuilder { grammar_path: self.grammar_path.clone(), + grammar_src: None, + from_ast: None, output_path: self.output_path.clone(), mod_name: self.mod_name, recoverer: self.recoverer, @@ -975,6 +1007,10 @@ where // All variables except for `output_path`, `inspect_callback` and `phantom` should // be written into the cache. grammar_path, + // I struggle to imagine the correct thing for `grammar_src`. + grammar_src: _, + // I struggle to imagine the correct thing for `from_ast`. + from_ast: _, mod_name, recoverer, yacckind,