diff --git a/cfgrammar/src/lib/yacc/grammar.rs b/cfgrammar/src/lib/yacc/grammar.rs
index fe9def8a2..5f43470f2 100644
--- a/cfgrammar/src/lib/yacc/grammar.rs
+++ b/cfgrammar/src/lib/yacc/grammar.rs
@@ -644,6 +644,17 @@ where
         m
     }
 
+    /// Return an iterator over the names and indices of all named tokens.
+    pub fn tokens_map_iter(&self) -> impl Iterator<Item = (&str, TIdx<StorageT>)> {
+        self.iter_tidxs().filter_map(|tidx| {
+            if let Some((_, name)) = self.token_names[usize::from(tidx)].as_ref() {
+                Some((name.as_str(), tidx))
+            } else {
+                None
+            }
+        })
+    }
+
     /// Return the index of the token named `n` or `None` if it doesn't exist.
     pub fn token_idx(&self, n: &str) -> Option<TIdx<StorageT>> {
         self.token_names
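For context, the new `tokens_map_iter` yields the same `(name, TIdx)` pairs as the existing `tokens_map` without allocating a `HashMap`. A minimal sketch of a caller, assuming a `YaccGrammar<u32>` obtained elsewhere (the function below is illustrative, not part of this patch):

```rust
use cfgrammar::yacc::YaccGrammar;

// Sketch: walk a grammar's named tokens without building an intermediate
// HashMap. `grm` is assumed to come from the usual YaccGrammar construction.
fn dump_tokens(grm: &YaccGrammar<u32>) {
    for (name, tidx) in grm.tokens_map_iter() {
        // `usize::from(tidx)` mirrors the indexing used in the method body.
        println!("{name} -> {}", usize::from(tidx));
    }
}
```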
diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs
index 835eebcf3..bf6dda331 100644
--- a/lrlex/src/lib/ctbuilder.rs
+++ b/lrlex/src/lib/ctbuilder.rs
@@ -2,6 +2,7 @@
 use std::{
     any::type_name,
+    borrow::Borrow,
     collections::{HashMap, HashSet},
     env::{current_dir, var},
     error::Error,
@@ -446,7 +447,7 @@ where
     /// * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
     ///   module name is `c_l` (i.e. the file's leaf name, minus its extension, with a prefix of
     ///   `_l`).
-    pub fn build(mut self) -> Result<CTLexer, Box<dyn Error>> {
+    pub fn build(self) -> Result<CTLexer, Box<dyn Error>> {
         let lexerp = self
             .lexer_path
             .as_ref()
@@ -521,15 +522,15 @@ where
             }
         };
 
-        if let Some(ref lrcfg) = self.lrpar_config {
-            let mut lexerdef = lexerdef.clone();
+        let ct_parser = if let Some(ref lrcfg) = self.lrpar_config {
+            let mut closure_lexerdef = lexerdef.clone();
             let mut ctp = CTParserBuilder::<LexerTypesT>::new().inspect_rt(Box::new(
                 move |yacc_header, rtpb, rule_ids_map, grm_path| {
                     let owned_map = rule_ids_map
                         .iter()
                         .map(|(x, y)| (&**x, *y))
                         .collect::<HashMap<_, _>>();
-                    lexerdef.set_rule_ids(&owned_map);
+                    closure_lexerdef.set_rule_ids(&owned_map);
                     yacc_header.mark_used(&"test_files".to_string());
                     let test_glob = yacc_header.get("test_files");
                     match test_glob {
@@ -540,7 +541,8 @@ where
                             {
                                 let path = path?;
                                 let input = fs::read_to_string(&path)?;
-                                let l: LRNonStreamingLexer<'_, '_, LexerTypesT> = lexerdef.lexer(&input);
+                                let l: LRNonStreamingLexer<'_, '_, LexerTypesT> =
+                                    closure_lexerdef.lexer(&input);
                                 for e in rtpb.parse_noaction(&l) {
                                     Err(format!("parsing {}: {}", path.display(), e))?
                                 }
@@ -553,9 +555,11 @@ where
                 },
             ));
             ctp = lrcfg(ctp);
-            let map = ctp.build()?;
-            self.rule_ids_map = Some(map.token_map().to_owned());
-        }
+            let ct_parser = ctp.build()?;
+            Some(ct_parser)
+        } else {
+            None
+        };
 
         let mut lexerdef = Box::new(lexerdef);
         let unused_header_values = header.unused();
@@ -566,36 +570,84 @@ where
         }
 
         let (missing_from_lexer, missing_from_parser) = match self.rule_ids_map {
-            Some(ref rim) => {
-                // Convert from HashMap<String, _> to HashMap<&str, _>
-                let owned_map = rim
-                    .iter()
-                    .map(|(x, y)| (&**x, *y))
-                    .collect::<HashMap<_, _>>();
-                let (x, y) = lexerdef.set_rule_ids(&owned_map);
-                (
-                    x.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
-                    y.map(|a| a.iter().map(|&b| b.to_string()).collect::<HashSet<_>>()),
-                )
-            }
-            None => (None, None),
+            Some(ref rim) => lexerdef
+                .set_rule_ids_spanned_iter(rim.iter().map(|(name, tidx)| (name.as_str(), *tidx))),
+            None => match &ct_parser {
+                Some(ct_parser) => lexerdef.set_rule_ids_spanned_iter(
+                    ct_parser.yacc_grammar().iter_tidxs().filter_map(|tidx| {
+                        ct_parser
+                            .yacc_grammar()
+                            .token_name(tidx)
+                            .map(|n| (n, tidx.as_storaget()))
+                    }),
+                ),
+                None => (None, None),
+            },
         };
 
         let mut has_unallowed_missing = false;
+        let err_indent = " ".repeat(ERROR.len());
         if !self.allow_missing_terms_in_lexer {
             if let Some(ref mfl) = missing_from_lexer {
-                eprintln!("Error: the following tokens are used in the grammar but are not defined in the lexer:");
-                for n in mfl {
-                    eprintln!("    {}", n);
+                if let Some(ct_parser) = &ct_parser {
+                    let grm = ct_parser.yacc_grammar();
+                    let token_spans = mfl
+                        .iter()
+                        .map(|name| {
+                            ct_parser
+                                .yacc_grammar()
+                                .token_span(*grm.tokens_map().get(name).unwrap())
+                                .expect("Given token should have a span")
+                        })
+                        .collect::<Vec<_>>();
+
+                    let yacc_diag = SpannedDiagnosticFormatter::new(
+                        ct_parser.grammar_src(),
+                        ct_parser.grammar_path(),
+                    );
+
+                    eprintln!("{ERROR} these tokens are not referenced in the lexer but defined as follows");
+                    eprintln!(
+                        "{err_indent} {}",
+                        yacc_diag.file_location_msg("in the grammar", None)
+                    );
+                    for span in token_spans {
+                        eprintln!(
+                            "{}",
+                            yacc_diag.underline_span_with_text(
+                                span,
+                                "Missing from lexer".to_string(),
+                                '^'
+                            )
+                        );
+                    }
+                    eprintln!();
+                } else {
+                    eprintln!("{ERROR} the following tokens are used in the grammar but are not defined in the lexer:");
+                    for n in mfl {
+                        eprintln!("    {}", n);
+                    }
                 }
                 has_unallowed_missing = true;
             }
         }
         if !self.allow_missing_tokens_in_parser {
             if let Some(ref mfp) = missing_from_parser {
-                eprintln!("Error: the following tokens are defined in the lexer but not used in the grammar:");
-                for n in mfp {
-                    eprintln!("    {}", n);
+                eprintln!(
+                    "{ERROR} these tokens are not referenced in the grammar but defined as follows"
+                );
+                eprintln!(
+                    "{err_indent} {}",
+                    lex_diag.file_location_msg("in the lexer", None)
+                );
+                for (_, span) in mfp {
+                    eprintln!(
+                        "{}",
+                        lex_diag.underline_span_with_text(
+                            *span,
+                            "Missing from parser".to_string(),
+                            '^'
+                        )
+                    );
                 }
                 has_unallowed_missing = true;
             }
         }
@@ -748,90 +800,12 @@ where
         // binary etc).
         if let Ok(curs) = read_to_string(outp) {
             if curs == outs {
-                return Ok(CTLexer {
-                    missing_from_lexer,
-                    missing_from_parser,
-                });
+                return Ok(CTLexer);
             }
         }
         let mut f = File::create(outp)?;
         f.write_all(outs.as_bytes())?;
-        Ok(CTLexer {
-            missing_from_lexer,
-            missing_from_parser,
-        })
-    }
-
-    /// Given the filename `a/b.l` as input, statically compile the file `src/a/b.l` into a Rust
-    /// module which can then be imported using `lrlex_mod!("a/b.l")`. This is a convenience
-    /// function around [`process_file`](struct.CTLexerBuilder.html#method.process_file) which makes
-    /// it easier to compile `.l` files stored in a project's `src/` directory: please see
-    /// [`process_file`](#method.process_file) for additional constraints and information about the
-    /// generated files.
-    #[deprecated(
-        since = "0.11.0",
-        note = "Please use lexer_in_src_dir() and build() instead"
-    )]
-    #[allow(deprecated)]
-    pub fn process_file_in_src(
-        self,
-        srcp: &str,
-    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>> {
-        let mut inp = current_dir()?;
-        inp.push("src");
-        inp.push(srcp);
-        let mut outp = PathBuf::new();
-        outp.push(var("OUT_DIR").unwrap());
-        outp.push(Path::new(srcp).parent().unwrap().to_str().unwrap());
-        create_dir_all(&outp)?;
-        let mut leaf = Path::new(srcp)
-            .file_name()
-            .unwrap()
-            .to_str()
-            .unwrap()
-            .to_owned();
-        write!(leaf, ".{}", RUST_FILE_EXT).ok();
-        outp.push(leaf);
-        self.process_file(inp, outp)
-    }
-
-    /// Statically compile the `.l` file `inp` into Rust, placing the output into the file `outp`.
-    /// The latter defines a module as follows:
-    ///
-    /// ```text
-    /// mod modname {
-    ///     pub fn lexerdef() -> LexerDef { ... }
-    ///
-    ///     ...
-    /// }
-    /// ```
-    ///
-    /// where:
-    /// * `modname` is either:
-    ///   * the module name specified [`mod_name`](#method.mod_name)
-    ///   * or, if no module name was explicitly specified, then for the file `/a/b/c.l` the
-    ///     module name is `c_l` (i.e. the file's leaf name, minus its extension, with a prefix of
-    ///     `_l`).
-    #[deprecated(
-        since = "0.11.0",
-        note = "Please use lexer_in_src_dir() and build() instead"
-    )]
-    pub fn process_file<P, Q>(
-        mut self,
-        inp: P,
-        outp: Q,
-    ) -> Result<(Option<HashSet<String>>, Option<HashSet<String>>), Box<dyn Error>>
-    where
-        P: AsRef<Path>,
-        Q: AsRef<Path>,
-    {
-        self.lexer_path = Some(inp.as_ref().to_owned());
-        self.output_path = Some(outp.as_ref().to_owned());
-        let cl = self.build()?;
-        Ok((
-            cl.missing_from_lexer().map(|x| x.to_owned()),
-            cl.missing_from_parser().map(|x| x.to_owned()),
-        ))
+        Ok(CTLexer)
     }
 
     /// If passed false, tokens used in the grammar but not defined in the lexer will cause a
@@ -1064,20 +1038,7 @@ where
 }
 
 /// An interface to the result of [CTLexerBuilder::build()].
-pub struct CTLexer {
-    missing_from_lexer: Option<HashSet<String>>,
-    missing_from_parser: Option<HashSet<String>>,
-}
-
-impl CTLexer {
-    fn missing_from_lexer(&self) -> Option<&HashSet<String>> {
-        self.missing_from_lexer.as_ref()
-    }
-
-    fn missing_from_parser(&self) -> Option<&HashSet<String>> {
-        self.missing_from_parser.as_ref()
-    }
-}
+pub struct CTLexer;
 
 /// Create a Rust module named `mod_name` that can be imported with
 /// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
@@ -1105,7 +1066,7 @@ impl CTLexer {
 /// ```
 pub fn ct_token_map<StorageT: Display>(
     mod_name: &str,
-    token_map: &HashMap<String, StorageT>,
+    token_map: impl Borrow<HashMap<String, StorageT>>,
     rename_map: Option<&HashMap<&str, &str>>,
 ) -> Result<(), Box<dyn Error>> {
     // Record the time that this version of lrlex was built. If the source code changes and rustc
@@ -1122,6 +1083,7 @@ pub fn ct_token_map<StorageT: Display>(
         .ok();
     outs.push_str(
         &token_map
+            .borrow()
            .iter()
            .map(|(k, v)| {
                let k = match rename_map {
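Since `ct_token_map` now accepts `impl Borrow<HashMap<String, StorageT>>`, callers can hand it either a reference or an owned map, including the boxed map that `CTParser::token_map()` returns below. A hedged sketch for a build script (the module names and token IDs are invented):

```rust
use std::collections::HashMap;
use std::error::Error;

// Sketch only: invented token IDs. Both calls satisfy the new
// `impl Borrow<HashMap<String, u8>>` parameter; previously only a
// `&HashMap` was accepted.
fn emit_token_mods() -> Result<(), Box<dyn Error>> {
    let map: HashMap<String, u8> = [("INT".to_string(), 0u8), ("ID".to_string(), 1u8)]
        .into_iter()
        .collect();
    lrlex::ct_token_map::<u8>("token_ids", &map, None)?; // borrowed
    lrlex::ct_token_map::<u8>("token_ids_owned", map, None)?; // owned
    Ok(())
}
```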
diff --git a/lrlex/src/lib/lexer.rs b/lrlex/src/lib/lexer.rs
index a62e42307..c856caf56 100644
--- a/lrlex/src/lib/lexer.rs
+++ b/lrlex/src/lib/lexer.rs
@@ -374,11 +374,20 @@ where
         rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
     ) -> (Option<HashSet<&'a str>>, Option<HashSet<&'a str>>);
 
+    /// Like `set_rule_ids` but also returns a `Span` for missing lex rules.
     fn set_rule_ids_spanned<'a>(
         &'a mut self,
         rule_ids_map: &HashMap<&'a str, LexerTypesT::StorageT>,
     ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>);
 
+    /// Like `set_rule_ids_spanned` but takes an iterator of `(name, id)` pairs.
+    fn set_rule_ids_spanned_iter<'a, I>(
+        &'a mut self,
+        rule_ids_map: I,
+    ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>)
+    where
+        I: IntoIterator<Item = (&'a str, LexerTypesT::StorageT)>;
+
     /// Returns an iterator over all rules in this AST.
     fn iter_rules(&self) -> Iter<Rule<LexerTypesT::StorageT>>;
 
@@ -517,6 +526,71 @@ where
         (missing_from_lexer, missing_from_parser)
     }
 
+    /// Like `set_rule_ids_spanned` but takes an iterator.
+    fn set_rule_ids_spanned_iter<'a, I>(
+        &'a mut self,
+        rule_ids_iter: I,
+    ) -> (Option<HashSet<&'a str>>, Option<HashSet<(&'a str, Span)>>)
+    where
+        I: IntoIterator<Item = (&'a str, LexerTypesT::StorageT)>,
+    {
+        let rule_ids_map: HashMap<&str, LexerTypesT::StorageT> =
+            HashMap::from_iter(rule_ids_iter.into_iter());
+        // Because we have to iter_mut over self.rules, we can't easily store a reference to the
+        // rule's name at the same time. Instead, we store the index of each such rule and
+        // recover the names later. This has the unfortunate consequence of extending the mutable
+        // borrow for the rest of the 'a lifetime. To avoid that we could return indexes here,
+        // but the original `set_rule_ids` invalidates indexes. In the spirit of keeping that
+        // behavior consistent, this also returns the span.
+        let mut missing_from_parser_idxs = Vec::new();
+        let mut rules_with_names = 0;
+        for (i, r) in self.rules.iter_mut().enumerate() {
+            if let Some(n) = r.name() {
+                match rule_ids_map.get(n) {
+                    Some(tok_id) => r.tok_id = Some(*tok_id),
+                    None => {
+                        r.tok_id = None;
+                        missing_from_parser_idxs.push(i);
+                    }
+                }
+                rules_with_names += 1;
+            }
+        }
+
+        let missing_from_parser = if missing_from_parser_idxs.is_empty() {
+            None
+        } else {
+            let mut mfp = HashSet::with_capacity(missing_from_parser_idxs.len());
+            for i in &missing_from_parser_idxs {
+                mfp.insert((self.rules[*i].name().unwrap(), self.rules[*i].name_span()));
+            }
+            Some(mfp)
+        };
+
+        let missing_from_lexer =
+            if rules_with_names - missing_from_parser_idxs.len() == rule_ids_map.len() {
+                None
+            } else {
+                Some(
+                    rule_ids_map
+                        .keys()
+                        .cloned()
+                        .collect::<HashSet<&str>>()
+                        .difference(
+                            &self
+                                .rules
+                                .iter()
+                                .filter_map(|x| x.name())
+                                .collect::<HashSet<&str>>(),
+                        )
+                        .cloned()
+                        .collect::<HashSet<&str>>(),
+                )
+            };
+
+        (missing_from_lexer, missing_from_parser)
+    }
+
     fn iter_rules(&self) -> Iter<Rule<LexerTypesT::StorageT>> {
         self.rules.iter()
     }
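`set_rule_ids_spanned_iter` lets callers feed IDs straight from an iterator, which is what the `ctbuilder.rs` change above does with the grammar's `iter_tidxs`/`token_name`. A sketch of direct use against a toy lexer definition (the lex source and token IDs are invented):

```rust
use lrlex::{DefaultLexerTypes, LRNonStreamingLexerDef, LexerDef};

// Sketch: the parser side knows "INT" and "FLOAT"; the lexer defines
// "INT" and "ID", so each side reports one missing name.
fn check_ids() {
    let lex_src = "%%\n[0-9]+ \"INT\"\n[a-z]+ \"ID\"\n";
    let mut lexerdef = LRNonStreamingLexerDef::<DefaultLexerTypes<u32>>::from_str(lex_src)
        .expect("invalid lex source");
    let ids = [("INT", 0u32), ("FLOAT", 1u32)];
    let (missing_from_lexer, missing_from_parser) = lexerdef.set_rule_ids_spanned_iter(ids);
    assert!(missing_from_lexer.unwrap().contains("FLOAT"));
    // The parser-side set now carries a Span pointing at "ID" in `lex_src`.
    assert_eq!(missing_from_parser.unwrap().len(), 1);
}
```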
diff --git a/lrpar/src/lib/ctbuilder.rs b/lrpar/src/lib/ctbuilder.rs
index 52e4135fb..441dff45c 100644
--- a/lrpar/src/lib/ctbuilder.rs
+++ b/lrpar/src/lib/ctbuilder.rs
@@ -691,7 +691,9 @@ where
                 if outc.contains(&cache.to_string()) {
                     return Ok(CTParser {
                         regenerated: false,
-                        rule_ids,
+                        yacc_grammar: grm,
+                        grammar_src: inc,
+                        grammar_path: self.grammar_path.unwrap(),
                         conflicts: None,
                     });
                 } else {
@@ -779,13 +781,15 @@ where
             &format!("/* CACHE INFORMATION {} */\n", cache),
         )?;
         let conflicts = if stable.conflicts().is_some() {
-            Some((grm, sgraph, stable))
+            Some((sgraph, stable))
         } else {
             None
         };
         Ok(CTParser {
             regenerated: true,
-            rule_ids,
+            yacc_grammar: grm,
+            grammar_src: inc,
+            grammar_path: self.grammar_path.unwrap(),
             conflicts,
         })
     }
@@ -890,7 +894,7 @@ where
             inspect_callback: None,
             phantom: PhantomData,
         };
-        Ok(cl.build()?.rule_ids)
+        Ok(*cl.build()?.token_map())
     }
 
     fn output_file<P: AsRef<Path>>(
@@ -1488,12 +1492,10 @@ where
     StorageT: Eq + Hash,
 {
     regenerated: bool,
-    rule_ids: HashMap<String, StorageT>,
-    conflicts: Option<(
-        YaccGrammar<StorageT>,
-        StateGraph<StorageT>,
-        StateTable<StorageT>,
-    )>,
+    yacc_grammar: YaccGrammar<StorageT>,
+    grammar_src: String,
+    grammar_path: PathBuf,
+    conflicts: Option<(StateGraph<StorageT>, StateTable<StorageT>)>,
 }
 
 impl<StorageT> CTParser<StorageT>
 where
@@ -1508,8 +1510,14 @@ where
     /// Returns a [HashMap] from lexeme string types to numeric types (e.g. `INT: 2`), suitable for
     /// handing to a lexer to coordinate the IDs of lexer and parser.
-    pub fn token_map(&self) -> &HashMap<String, StorageT> {
-        &self.rule_ids
+    pub fn token_map(&self) -> Box<HashMap<String, StorageT>> {
+        Box::new(
+            self.yacc_grammar
+                .tokens_map()
+                .iter()
+                .map(|(name, tidx)| (name.to_string(), tidx.as_storaget()))
+                .collect::<HashMap<_, _>>(),
+        )
     }
 
     /// If there are any conflicts in the grammar, return a tuple which allows users to inspect and
     /// pretty print them; otherwise returns `None`.
@@ -1527,11 +1535,29 @@ where
         &StateTable<StorageT>,
         &Conflicts<StorageT>,
     )> {
-        if let Some((grm, sgraph, stable)) = &self.conflicts {
-            return Some((grm, sgraph, stable, stable.conflicts().unwrap()));
+        if let Some((sgraph, stable)) = &self.conflicts {
+            return Some((
+                &self.yacc_grammar,
+                sgraph,
+                stable,
+                stable.conflicts().unwrap(),
+            ));
         }
         None
     }
+
+    #[doc(hidden)]
+    pub fn yacc_grammar(&self) -> &YaccGrammar<StorageT> {
+        &self.yacc_grammar
+    }
+    #[doc(hidden)]
+    pub fn grammar_src(&self) -> &str {
+        &self.grammar_src
+    }
+    #[doc(hidden)]
+    pub fn grammar_path(&self) -> &Path {
+        self.grammar_path.as_path()
+    }
 }
 
 /// Indents a multi-line string and trims any trailing newline.
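Taken together: `CTLexer` is now unit-like, the missing-token diagnostics are printed (span-annotated) during `build()` itself, and token IDs flow from the `CTParser`'s retained `YaccGrammar` rather than a stored `rule_ids` map. A typical build script keeps the same shape as before; a sketch with invented `calc.y`/`calc.l` file names (assuming the `.y` file carries its settings in a `%grmtools{...}` header, as the `yacc_header` plumbing above expects):

```rust
// build.rs sketch: file names are invented. CTLexerBuilder drives the parser
// build via lrpar_config; missing-token errors are now reported with spans
// during build() rather than returned on the CTLexer value.
use lrlex::CTLexerBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    CTLexerBuilder::new()
        .lrpar_config(|ctp| ctp.grammar_in_src_dir("calc.y").unwrap())
        .lexer_in_src_dir("calc.l")?
        .build()?;
    Ok(())
}
```

One consequence of dropping the `missing_from_lexer`/`missing_from_parser` accessors is that callers who want the sets programmatically now go through `set_rule_ids_spanned` or `set_rule_ids_spanned_iter` on the lexer definition directly, gaining `Span`s in the process.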