diff --git a/doc/src/manuallexer.md b/doc/src/manuallexer.md
index 9f29bf9b8..95e56df2c 100644
--- a/doc/src/manuallexer.md
+++ b/doc/src/manuallexer.md
@@ -28,17 +28,17 @@ and the boiler-plate that comes with it unwanted. Fortunately, `lrlex` provides
     [`lrpar::Lexeme`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.Lexeme.html)
     trait.
 
- 3. `lrlex` exposes a
-    [`ct_token_map`](https://softdevteam.github.io/grmtools/master/api/lrlex/fn.ct_token_map.html)
-    function to be used from `build.rs` scripts which automatically produces a
-    Rust module with one constant per token ID. `ct_token_map` is explicitly
+ 3. `lrlex` exposes
+    [`CTTokenMapBuilder`](https://softdevteam.github.io/grmtools/master/api/lrlex/struct.CTTokenMapBuilder.html),
+    a builder used from `build.rs` scripts, which automatically produces a
+    Rust module with one constant per token ID. It is explicitly
     designed to be easy to use with `lrpar`'s compile-time building.
 
 Putting these together is then relatively easy. First a `build.rs` file for a
 hand-written lexer will look roughly as follows:
 
 ```rust
-use lrlex::{ct_token_map, DefaultLexerTypes};
+use lrlex::{CTTokenMapBuilder, DefaultLexerTypes};
 use lrpar::CTParserBuilder;
 
 fn main() {
@@ -47,7 +47,7 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-    ct_token_map::<u8>("token_map", ctp.token_map(), None).unwrap()
+    CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map()).build().unwrap()
 }
 ```
 
@@ -65,7 +65,7 @@ Expr -> Result<u64, ()>:
 the module will contain `const T_PLUS: u8 = ...;`.
 
 Since Yacc grammars can contain token identifiers which are not valid Rust
-identifiers, `ct_token_map` allows you to provide a map from the token
+identifiers, `CTTokenMapBuilder` allows you to provide a map from the token
 identifier to a "Rust friendly" variant. For example, for the following
 grammar excerpt:
diff --git a/lrlex/examples/calc_manual_lex/build.rs b/lrlex/examples/calc_manual_lex/build.rs
index 588176444..ca884840f 100644
--- a/lrlex/examples/calc_manual_lex/build.rs
+++ b/lrlex/examples/calc_manual_lex/build.rs
@@ -1,4 +1,4 @@
-use lrlex::{DefaultLexerTypes, ct_token_map};
+use lrlex::{CTTokenMapBuilder, DefaultLexerTypes};
 use lrpar::CTParserBuilder;
 
 // Some of the token names in the parser do not lead to valid Rust identifiers, so we map them to
@@ -16,10 +16,8 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-    ct_token_map::<u8>(
-        "token_map",
-        ctp.token_map(),
-        Some(&TOKENS_MAP.iter().cloned().collect()),
-    )
-    .unwrap();
+    CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map())
+        .rename_map(Some(TOKENS_MAP))
+        .build()
+        .unwrap();
 }
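For reference, both `build.rs` hunks above drive the builder to write a `token_map` module into `OUT_DIR`, which a hand-written lexer then imports with `lrlex_mod!("token_map")`. A sketch of the generated file, assuming a rename map containing `("+", "PLUS")` and an `ID` token; `TOKENS_MAP`'s real contents are elided by the hunk above, and the IDs are whatever the parser builder assigned:

```rust
// Illustrative sketch of the OUT_DIR/token_map.rs that CTTokenMapBuilder
// emits: constant names come from the (possibly renamed) token names,
// prefixed with `T_`.
mod token_map {
    pub const T_PLUS: u8 = 0;
    pub const T_ID: u8 = 1;
    // TOK_IDS is written with literal IDs rather than the constants above so
    // that the dead_code lint still fires for individually unused constants.
    #[allow(dead_code)]
    pub const TOK_IDS: &[u8] = &[0, 1];
}
```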
diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs
index 194dba24b..1dc45f549 100644
--- a/lrlex/src/lib/ctbuilder.rs
+++ b/lrlex/src/lib/ctbuilder.rs
@@ -1,19 +1,5 @@
 //! Build grammars at run-time.
 
-use std::{
-    any::type_name,
-    borrow::Borrow,
-    collections::{HashMap, HashSet},
-    env::{current_dir, var},
-    error::Error,
-    fmt::{self, Debug, Display, Write as _},
-    fs::{self, File, create_dir_all, read_to_string},
-    hash::Hash,
-    io::Write,
-    path::{Path, PathBuf},
-    sync::{LazyLock, Mutex},
-};
-
 use bincode::Encode;
 use cfgrammar::{
     header::{
@@ -29,9 +15,23 @@ use lrpar::{
     diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
 };
 use num_traits::{AsPrimitive, PrimInt, Unsigned};
-use proc_macro2::TokenStream;
+use proc_macro2::{Ident, TokenStream};
 use quote::{ToTokens, TokenStreamExt, format_ident, quote};
 use regex::Regex;
+use std::marker::PhantomData;
+use std::{
+    any::type_name,
+    borrow::Borrow,
+    collections::{HashMap, HashSet},
+    env::{current_dir, var},
+    error::Error,
+    fmt::{self, Debug, Display, Write as _},
+    fs::{self, File, create_dir_all, read_to_string},
+    hash::Hash,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::{LazyLock, Mutex},
+};
 
 use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};
 
@@ -1183,12 +1183,15 @@ impl CTLexer {
     }
 }
 
-/// Create a Rust module named `mod_name` that can be imported with
-/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
-/// token in `token_map`, with the token prefixed by `T_`. In addition, it will
-/// contain an array of all token IDs `TOK_IDS`.
+/// Exports all token IDs used by a parser as a separate Rust module.
+///
+/// This builder will create a Rust module named `mod_name`
+/// that can be imported with [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
+/// The module will contain one `const` `StorageT` per token in `token_map`,
+/// with the token prefixed by `T_`. In addition, it will contain
+/// an array of all token IDs `TOK_IDS`.
 ///
-/// For example with `StorageT` `u8`, `mod_name` `x`, and `token_map`
+/// For example, if `StorageT` is `u8`, `mod_name` is `x`, and `token_map` is
 /// `HashMap{"ID": 0, "INT": 1}` the generated module will look roughly as follows:
 ///
 /// ```rust,ignore
@@ -1199,83 +1202,182 @@ impl CTLexer {
 /// }
 /// ```
 ///
-/// You can optionally remap names (for example, because the parser's token names do not lead to
-/// valid Rust identifiers) by specifying the `rename_map` `HashMap`. For example, if `token_map`
-/// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}` then the generated
-/// module will look roughly as follows:
+/// See the [custom lexer example] for more usage details.
 ///
-/// ```rust,ignore
-/// mod x {
-///     pub const T_PLUS: u8 = 0;
-///     pub const T_ID: u8 = 1;
-///     pub const TOK_IDS: &[u8] = &[T_PLUS, T_ID];
-/// }
-/// ```
-pub fn ct_token_map<StorageT: ToTokens>(
-    mod_name: &str,
-    token_map: impl Borrow<HashMap<String, StorageT>>,
-    rename_map: Option<&HashMap<&str, &str>>,
-) -> Result<(), Box<dyn Error>> {
-    // Record the time that this version of lrlex was built. If the source code changes and rustc
-    // forces a recompile, this will change this value, causing anything which depends on this
-    // build of lrlex to be recompiled too.
-    let mut outs = String::new();
-    let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
-    let mod_ident = format_ident!("{}", mod_name);
-    write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
-    let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
-    // Sort the tokens so that they're always in the same order.
-    // This will prevent unneeded rebuilds.
-    let mut token_map_sorted = Vec::from_iter(token_map.borrow().iter());
-    token_map_sorted.sort_by_key(|(k, _)| *k);
-    let (token_array, tokens): (TokenStream, TokenStream) = token_map_sorted
-        .iter()
-        .map(|(k, id)| {
-            let name = match rename_map {
-                Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
-                _ => &k,
-            };
-            let tok_ident = format_ident!("T_{}", name.to_ascii_uppercase());
-            (
-                quote! {
-                    #tok_ident,
-                },
-                quote! {
-                    pub const #tok_ident: #storaget = #id;
-                },
-            )
-        })
-        .unzip();
-    // Since the formatter doesn't preserve comments and we don't want to lose build time,
-    // just format the module contents.
-    let unformatted = quote! {
-        mod #mod_ident {
-            #![allow(dead_code)]
-            #tokens
-            pub const TOK_IDS: &[#storaget] = &[#token_array];
+/// [custom lexer example]: https://github.com/softdevteam/grmtools/tree/master/lrlex/examples/calc_manual_lex
+#[derive(Debug, Clone)]
+pub struct CTTokenMapBuilder<StorageT> {
+    mod_name: String,
+    token_map: Vec<(String, TokenStream)>,
+    rename_map: Option<HashMap<String, String>>,
+    allow_dead_code: bool,
+    _marker: PhantomData<StorageT>,
+}
+
+impl<StorageT: ToTokens> CTTokenMapBuilder<StorageT> {
+    /// Create a new token map builder.
+    ///
+    /// See the [builder documentation] for more info.
+    ///
+    /// [builder documentation]: CTTokenMapBuilder
+    pub fn new(
+        mod_name: impl Into<String>,
+        token_map: impl Borrow<HashMap<String, StorageT>>,
+    ) -> Self {
+        Self {
+            mod_name: mod_name.into(),
+            token_map: token_map
+                .borrow()
+                .iter()
+                .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
+                .collect(),
+            rename_map: None,
+            allow_dead_code: false,
+            _marker: PhantomData,
         }
     }
-    .to_string();
-    let out_mod = syn::parse_str(&unformatted)
-        .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
-        .unwrap_or(unformatted);
-    outs.push_str(&out_mod);
-    let mut outp = PathBuf::from(var("OUT_DIR")?);
-    outp.push(mod_name);
-    outp.set_extension("rs");
-
-    // If the file we're about to write out already exists with the same contents, then we
-    // don't overwrite it (since that will force a recompile of the file, and relinking of the
-    // binary etc).
-    if let Ok(curs) = read_to_string(&outp) {
-        if curs == outs {
-            return Ok(());
+
+    /// Set a token rename map.
+    ///
+    /// The rename map specifies identifier names to use for tokens whose
+    /// names are not valid Rust identifiers. For example, if `token_map`
+    /// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}`
+    /// then the generated module will look roughly as follows:
+    ///
+    /// ```rust,ignore
+    /// mod x {
+    ///     pub const T_PLUS: u8 = 0;
+    ///     pub const T_ID: u8 = 1;
+    /// }
+    /// ```
+    pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
+    where
+        M: IntoIterator<Item = I>,
+        I: Borrow<(K, V)>,
+        K: AsRef<str>,
+        V: AsRef<str>,
+    {
+        self.rename_map = rename_map.map(|rename_map| {
+            rename_map
+                .into_iter()
+                .map(|it| {
+                    let (k, v) = it.borrow();
+                    let k = k.as_ref().into();
+                    let v = v.as_ref().into();
+                    (k, v)
+                })
+                .collect()
+        });
+        self
+    }
+
+    /// Control whether the builder will add `#[allow(dead_code)]`
+    /// to the generated module.
+    ///
+    /// By default the generated constants are subject to the normal
+    /// `dead_code` lint, meaning that you'll get a warning if your custom
+    /// lexer doesn't use all of them.
+    /// This function can be used to disable that behaviour.
+    pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
+        self.allow_dead_code = allow_dead_code;
+        self
+    }
+
+    /// Build the token map module.
+    pub fn build(&self) -> Result<(), Box<dyn Error>> {
+        // Record the time that this version of lrlex was built. If the source code changes and rustc
+        // forces a recompile, this will change this value, causing anything which depends on this
+        // build of lrlex to be recompiled too.
+        let mut outs = String::new();
+        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
+        let mod_ident = format_ident!("{}", self.mod_name);
+        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
+        let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
+        // Sort the tokens so that they're always in the same order.
+        // This will prevent unneeded rebuilds.
+        let mut token_map_sorted = self.token_map.clone();
+        token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
+        let (token_array, tokens) = token_map_sorted
+            .iter()
+            .map(|(k, id)| {
+                let name = match &self.rename_map {
+                    Some(rmap) => rmap.get(k).unwrap_or(k),
+                    _ => k,
+                };
+                let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
+                    .map_err(|e| {
+                        format!(
+                            "token name {:?} is not a valid Rust identifier: {}; \
+                             consider renaming it via `CTTokenMapBuilder::rename_map`.",
+                            name, e
+                        )
+                    })?;
+                Ok((
+                    // Note: the array of all tokens can't use `tok_ident` because
+                    // it will confuse the dead code checker. For this reason,
+                    // we use `id` here.
+                    quote! {
+                        #id,
+                    },
+                    quote! {
+                        pub const #tok_ident: #storaget = #id;
+                    },
+                ))
+            })
+            .collect::<Result<(TokenStream, TokenStream), String>>()?;
+        let unused_annotation;
+        if self.allow_dead_code {
+            unused_annotation = quote! {#[allow(dead_code)]};
+        } else {
+            unused_annotation = quote! {};
+        };
+        // Since the formatter doesn't preserve comments and we don't want to lose build time,
+        // just format the module contents.
+        let unformatted = quote! {
+            #unused_annotation
+            mod #mod_ident {
+                #tokens
+                #[allow(dead_code)]
+                pub const TOK_IDS: &[#storaget] = &[#token_array];
+            }
+        }
+        .to_string();
+        let out_mod = syn::parse_str(&unformatted)
+            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
+            .unwrap_or(unformatted);
+        outs.push_str(&out_mod);
+        let mut outp = PathBuf::from(var("OUT_DIR")?);
+        outp.push(&self.mod_name);
+        outp.set_extension("rs");
+
+        // If the file we're about to write out already exists with the same contents, then we
+        // don't overwrite it (since that will force a recompile of the file, and relinking of the
+        // binary etc).
+        if let Ok(curs) = read_to_string(&outp) {
+            if curs == outs {
+                return Ok(());
+            }
         }
+
+        let mut f = File::create(outp)?;
+        f.write_all(outs.as_bytes())?;
+        Ok(())
     }
+}
 
-    let mut f = File::create(outp)?;
-    f.write_all(outs.as_bytes())?;
-    Ok(())
+/// Create a Rust module named `mod_name` that can be imported with
+/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
+///
+/// This function is deprecated in favour of [`CTTokenMapBuilder`].
+#[deprecated(since = "0.14", note = "use `lrlex::CTTokenMapBuilder` instead")]
+pub fn ct_token_map<StorageT: ToTokens>(
+    mod_name: &str,
+    token_map: impl Borrow<HashMap<String, StorageT>>,
+    rename_map: Option<&HashMap<&str, &str>>,
+) -> Result<(), Box<dyn Error>> {
+    CTTokenMapBuilder::new(mod_name, token_map)
+        .rename_map(rename_map)
+        .allow_dead_code(true)
+        .build()
 }
 
 /// Indents a multi-line string and trims any trailing newline.
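For downstream crates the migration is mechanical. A minimal sketch of an updated `build.rs`, assuming `u8` storage and the `calc.y` grammar from the book example; the `.allow_dead_code(true)` call reproduces the deprecated function's lint behaviour, exactly as the wrapper above does:

```rust
use lrlex::{CTTokenMapBuilder, DefaultLexerTypes};
use lrpar::CTParserBuilder;

fn main() {
    let ctp = CTParserBuilder::<DefaultLexerTypes<u8>>::new()
        .grammar_in_src_dir("calc.y")
        .unwrap()
        .build()
        .unwrap();
    // Previously: ct_token_map::<u8>("token_map", ctp.token_map(), None).unwrap();
    CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map())
        // The old function added #![allow(dead_code)] unconditionally; opt in
        // here to keep that behaviour.
        .allow_dead_code(true)
        .build()
        .unwrap();
}
```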
diff --git a/lrlex/src/lib/mod.rs b/lrlex/src/lib/mod.rs
index 35352e681..f32cf641a 100644
--- a/lrlex/src/lib/mod.rs
+++ b/lrlex/src/lib/mod.rs
@@ -20,8 +20,12 @@ pub mod defaults;
 mod lexer;
 mod parser;
 
+#[allow(deprecated)]
 pub use crate::{
-    ctbuilder::{CTLexer, CTLexerBuilder, LexerKind, RustEdition, Visibility, ct_token_map},
+    ctbuilder::{
+        CTLexer, CTLexerBuilder, CTTokenMapBuilder, LexerKind, RustEdition, Visibility,
+        ct_token_map,
+    },
     defaults::{DefaultLexeme, DefaultLexerTypes},
     lexer::{
         DEFAULT_LEX_FLAGS, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef, Rule,