From 248fe3f60befb30d209fe6d1153c7f000d579b23 Mon Sep 17 00:00:00 2001 From: Tamika Nomara Date: Fri, 23 May 2025 15:10:47 +0400 Subject: [PATCH] Export an array of all tokens from `ct_token_map` This helps with writing structured input adapters for fuzzing. When fuzzing a parser specifically (as opposed to fuzzing lexer and parser at the same time), we'd like to supply it with an array of valid lexemes. This export helps us build such an array as we don't have to manually list all tokens in a fuzzing entry point. Note that I didn't implement this functionality for generated lexers because there's already a way to get all tokens via `mod_l::lexerdef().iter_rules()`. --- lrlex/src/lib/ctbuilder.rs | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/lrlex/src/lib/ctbuilder.rs b/lrlex/src/lib/ctbuilder.rs index 31902fc39..644aba017 100644 --- a/lrlex/src/lib/ctbuilder.rs +++ b/lrlex/src/lib/ctbuilder.rs @@ -1181,14 +1181,17 @@ impl CTLexer { /// Create a Rust module named `mod_name` that can be imported with /// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per -/// token in `token_map`, with the token prefixed by `T_`. For example with `StorageT` `u8`, -/// `mod_name` `x`, and `token_map` `HashMap{"ID": 0, "INT": 1}` the generated module will look -/// roughly as follows: +/// token in `token_map`, with the token prefixed by `T_`. In addition, it will +/// contain an array of all token IDs `TOK_IDS`. 
+/// +/// For example with `StorageT` `u8`, `mod_name` `x`, and `token_map` +/// `HashMap{"ID": 0, "INT": 1}` the generated module will look roughly as follows: /// /// ```rust,ignore /// mod x { /// pub const T_ID: u8 = 0; /// pub const T_INT: u8 = 1; +/// pub const TOK_IDS: &[u8] = &[T_ID, T_INT]; /// } /// ``` /// @@ -1201,6 +1204,7 @@ impl CTLexer { /// mod x { /// pub const T_PLUS: u8 = 0; /// pub const T_ID: u8 = 1; +/// pub const TOK_IDS: &[u8] = &[T_PLUS, T_ID]; /// } /// ``` pub fn ct_token_map<StorageT: Display>( @@ -1215,31 +1219,36 @@ pub fn ct_token_map<StorageT: Display>( let timestamp = env!("VERGEN_BUILD_TIMESTAMP"); let mod_ident = format_ident!("{}", mod_name); write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok(); + let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap(); // Sort the tokens so that they're always in the same order. // This will prevent unneeded rebuilds. let mut token_map_sorted = Vec::from_iter(token_map.borrow().iter()); token_map_sorted.sort_by_key(|(k, _)| *k); - let tokens = &token_map_sorted - .into_iter() + let (token_array, tokens): (TokenStream, TokenStream) = token_map_sorted + .iter() .map(|(k, id)| { let name = match rename_map { Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()), - _ => k, + _ => &k, }; let tok_ident = format_ident!("T_{}", name.to_ascii_uppercase()); - let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap(); - // Code gen for the constant token values. - quote! { - pub const #tok_ident: #storaget = #id; - } + ( + quote! { + #tok_ident, + }, + quote! { + pub const #tok_ident: #storaget = #id; + }, + ) }) - .collect::<Vec<_>>(); + .unzip(); // Since the formatter doesn't preserve comments and we don't want to lose build time, // just format the module contents. let unformatted = quote! { mod #mod_ident { #![allow(dead_code)] - #(#tokens)* + #tokens + pub const TOK_IDS: &[#storaget] = &[#token_array]; } } .to_string();