@@ -1,19 +1,5 @@
 //! Build grammars at run-time.
 
-use std::{
-    any::type_name,
-    borrow::Borrow,
-    collections::{HashMap, HashSet},
-    env::{current_dir, var},
-    error::Error,
-    fmt::{self, Debug, Display, Write as _},
-    fs::{self, File, create_dir_all, read_to_string},
-    hash::Hash,
-    io::Write,
-    path::{Path, PathBuf},
-    sync::{LazyLock, Mutex},
-};
-
 use bincode::Encode;
 use cfgrammar::{
     header::{
@@ -29,9 +15,23 @@ use lrpar::{
     diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
 };
 use num_traits::{AsPrimitive, PrimInt, Unsigned};
-use proc_macro2::TokenStream;
+use proc_macro2::{Ident, TokenStream};
 use quote::{ToTokens, TokenStreamExt, format_ident, quote};
 use regex::Regex;
+use std::marker::PhantomData;
+use std::{
+    any::type_name,
+    borrow::Borrow,
+    collections::{HashMap, HashSet},
+    env::{current_dir, var},
+    error::Error,
+    fmt::{self, Debug, Display, Write as _},
+    fs::{self, File, create_dir_all, read_to_string},
+    hash::Hash,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::{LazyLock, Mutex},
+};
 
 use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};
 
@@ -1183,12 +1183,15 @@ impl CTLexer {
     }
 }
 
-/// Create a Rust module named `mod_name` that can be imported with
-/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
-/// token in `token_map`, with the token prefixed by `T_`. In addition, it will
-/// contain an array of all token IDs `TOK_IDS`.
+/// Exports all the token IDs used by a parser as a separate Rust module.
+///
+/// This builder creates a Rust module named `mod_name` that can be imported
+/// with [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one
+/// `const` of type `StorageT` per token in `token_map`, named after the token
+/// and prefixed with `T_`. In addition, it contains `TOK_IDS`, an array of all
+/// the token IDs.
 ///
-/// For example with `StorageT` `u8`, `mod_name` `x`, and `token_map`
+/// For example, if `StorageT` is `u8`, `mod_name` is `x`, and `token_map` is
 /// `HashMap{"ID": 0, "INT": 1}` the generated module will look roughly as follows:
 ///
 /// ```rust,ignore
@@ -1199,83 +1202,182 @@ impl CTLexer {
 /// }
 /// ```
 ///
-/// You can optionally remap names (for example, because the parser's token names do not lead to
-/// valid Rust identifiers) by specifying the `rename_map` `HashMap`. For example, if `token_map`
-/// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}` then the generated
-/// module will look roughly as follows:
+/// See the [custom lexer example] for more usage details.
 ///
-/// ```rust,ignore
-/// mod x {
-///     pub const T_PLUS: u8 = 0;
-///     pub const T_ID: u8 = 1;
-///     pub const TOK_IDS: &[u8] = &[T_PLUS, T_ID];
-/// }
-/// ```
-pub fn ct_token_map<StorageT: Display + ToTokens>(
-    mod_name: &str,
-    token_map: impl Borrow<HashMap<String, StorageT>>,
-    rename_map: Option<&HashMap<&str, &str>>,
-) -> Result<(), Box<dyn Error>> {
-    // Record the time that this version of lrlex was built. If the source code changes and rustc
-    // forces a recompile, this will change this value, causing anything which depends on this
-    // build of lrlex to be recompiled too.
-    let mut outs = String::new();
-    let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
-    let mod_ident = format_ident!("{}", mod_name);
-    write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
-    let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
-    // Sort the tokens so that they're always in the same order.
-    // This will prevent unneeded rebuilds.
-    let mut token_map_sorted = Vec::from_iter(token_map.borrow().iter());
-    token_map_sorted.sort_by_key(|(k, _)| *k);
-    let (token_array, tokens): (TokenStream, TokenStream) = token_map_sorted
-        .iter()
-        .map(|(k, id)| {
-            let name = match rename_map {
-                Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
-                _ => &k,
-            };
-            let tok_ident = format_ident!("T_{}", name.to_ascii_uppercase());
-            (
-                quote! {
-                    #tok_ident,
-                },
-                quote! {
-                    pub const #tok_ident: #storaget = #id;
-                },
-            )
-        })
-        .unzip();
-    // Since the formatter doesn't preserve comments and we don't want to lose build time,
-    // just format the module contents.
-    let unformatted = quote! {
-        mod #mod_ident {
-            #![allow(dead_code)]
-            #tokens
-            pub const TOK_IDS: &[#storaget] = &[#token_array];
+/// [custom lexer example]: https://github.com/softdevteam/grmtools/tree/master/lrlex/examples/calc_manual_lex
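+///
+/// As a rough sketch (the module name, token names, and IDs here are
+/// illustrative, and the parser is assumed to use `u8` token IDs), a
+/// `build.rs` script might do:
+///
+/// ```rust,ignore
+/// let token_map: std::collections::HashMap<String, u8> =
+///     [("ID".to_string(), 0), ("INT".to_string(), 1)].into_iter().collect();
+/// CTTokenMapBuilder::<u8>::new("x", &token_map).build()?;
+/// ```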
+#[derive(Debug, Clone)]
+pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
+    mod_name: String,
+    token_map: Vec<(String, TokenStream)>,
+    rename_map: Option<HashMap<String, String>>,
+    allow_dead_code: bool,
+    _marker: PhantomData<StorageT>,
+}
+
+impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
+    /// Create a new token map builder.
+    ///
+    /// See the [builder documentation] for more info.
+    ///
+    /// [builder documentation]: CTTokenMapBuilder
+    pub fn new(
+        mod_name: impl Into<String>,
+        token_map: impl Borrow<HashMap<String, StorageT>>,
+    ) -> Self {
+        Self {
+            mod_name: mod_name.into(),
+            token_map: token_map
+                .borrow()
+                .iter()
+                .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
+                .collect(),
+            rename_map: None,
+            allow_dead_code: false,
+            _marker: PhantomData,
         }
     }
-    .to_string();
-    let out_mod = syn::parse_str(&unformatted)
-        .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
-        .unwrap_or(unformatted);
-    outs.push_str(&out_mod);
-    let mut outp = PathBuf::from(var("OUT_DIR")?);
-    outp.push(mod_name);
-    outp.set_extension("rs");
-
-    // If the file we're about to write out already exists with the same contents, then we
-    // don't overwrite it (since that will force a recompile of the file, and relinking of the
-    // binary etc).
-    if let Ok(curs) = read_to_string(&outp) {
-        if curs == outs {
-            return Ok(());
+
+    /// Set a token rename map.
+    ///
+    /// The rename map specifies identifier names for tokens whose names are
+    /// not valid Rust identifiers. For example, if `token_map` is
+    /// `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}`,
+    /// then the generated module will look roughly as follows:
+    ///
+    /// ```rust,ignore
+    /// mod x {
+    ///     pub const T_PLUS: u8 = 0;
+    ///     pub const T_ID: u8 = 1;
+    /// }
+    /// ```
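+    ///
+    /// For instance (a sketch; any iterator of string pairs is accepted):
+    ///
+    /// ```rust,ignore
+    /// let builder = builder.rename_map(Some([("+", "PLUS"), ("-", "MINUS")]));
+    /// ```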
+    pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
+    where
+        M: IntoIterator<Item = I>,
+        I: Borrow<(K, V)>,
+        K: AsRef<str>,
+        V: AsRef<str>,
+    {
+        self.rename_map = rename_map.map(|rename_map| {
+            rename_map
+                .into_iter()
+                .map(|it| {
+                    let (k, v) = it.borrow();
+                    let k = k.as_ref().into();
+                    let v = v.as_ref().into();
+                    (k, v)
+                })
+                .collect()
+        });
+        self
+    }
+
+    /// Control whether the builder will add `#[allow(dead_code)]`
+    /// to the generated module.
+    ///
+    /// By default no `allow` attribute is emitted, so the normal `dead_code`
+    /// lint applies: you'll get a warning if your custom lexer doesn't use
+    /// one of the generated constants. This function can be used to disable
+    /// that behaviour.
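+    ///
+    /// For example (sketch):
+    ///
+    /// ```rust,ignore
+    /// let builder = builder.allow_dead_code(true);
+    /// ```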
+    pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
+        self.allow_dead_code = allow_dead_code;
+        self
+    }
+
+    /// Build the token map module.
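+    ///
+    /// The generated module is written to `$OUT_DIR/<mod_name>.rs`, so this
+    /// is intended to be called from a `build.rs` script.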
+    pub fn build(&self) -> Result<(), Box<dyn Error>> {
+        // Record the time that this version of lrlex was built. If the source code changes and
+        // rustc forces a recompile, this will change this value, causing anything which depends
+        // on this build of lrlex to be recompiled too.
+        let mut outs = String::new();
+        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
+        let mod_ident = format_ident!("{}", self.mod_name);
+        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
+        let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
+        // Sort the tokens so that they're always in the same order.
+        // This will prevent unneeded rebuilds.
+        let mut token_map_sorted = self.token_map.clone();
+        token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
+        let (token_array, tokens) = token_map_sorted
+            .iter()
+            .map(|(k, id)| {
+                let name = match &self.rename_map {
+                    Some(rmap) => rmap.get(k).unwrap_or(k),
+                    _ => k,
+                };
+                let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
+                    .map_err(|e| {
+                        format!(
+                            "token name {:?} is not a valid Rust identifier: {}; \
+                             consider renaming it via `CTTokenMapBuilder::rename_map`.",
+                            name, e
+                        )
+                    })?;
+                Ok((
+                    // Note: the array of all tokens can't use `tok_ident` because
+                    // it would confuse the dead code checker. For this reason,
+                    // we use `id` here.
+                    quote! {
+                        #id,
+                    },
+                    quote! {
+                        pub const #tok_ident: #storaget = #id;
+                    },
+                ))
+            })
+            .collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
+        let unused_annotation = if self.allow_dead_code {
+            quote! { #[allow(dead_code)] }
+        } else {
+            quote! {}
+        };
+        // Since the formatter doesn't preserve comments and we don't want to lose the build
+        // time, just format the module contents.
+        let unformatted = quote! {
+            #unused_annotation
+            mod #mod_ident {
+                #tokens
+                #[allow(dead_code)]
+                pub const TOK_IDS: &[#storaget] = &[#token_array];
+            }
+        }
+        .to_string();
+        let out_mod = syn::parse_str(&unformatted)
+            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
+            .unwrap_or(unformatted);
+        outs.push_str(&out_mod);
+        let mut outp = PathBuf::from(var("OUT_DIR")?);
+        outp.push(&self.mod_name);
+        outp.set_extension("rs");
+
+        // If the file we're about to write out already exists with the same contents, then we
+        // don't overwrite it (since that will force a recompile of the file, and relinking of
+        // the binary etc).
+        if let Ok(curs) = read_to_string(&outp) {
+            if curs == outs {
+                return Ok(());
+            }
         }
+
+        let mut f = File::create(outp)?;
+        f.write_all(outs.as_bytes())?;
+        Ok(())
     }
+}
 
-    let mut f = File::create(outp)?;
-    f.write_all(outs.as_bytes())?;
-    Ok(())
+/// Create a Rust module named `mod_name` that can be imported with
+/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
+///
+/// This function is deprecated in favour of [`CTTokenMapBuilder`].
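+///
+/// It is equivalent to the following builder calls (mirroring the body below):
+///
+/// ```rust,ignore
+/// CTTokenMapBuilder::new(mod_name, token_map)
+///     .rename_map(rename_map)
+///     .allow_dead_code(true)
+///     .build()
+/// ```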
+#[deprecated(since = "0.14", note = "use `lrlex::CTTokenMapBuilder` instead")]
+pub fn ct_token_map<StorageT: Display + ToTokens>(
+    mod_name: &str,
+    token_map: impl Borrow<HashMap<String, StorageT>>,
+    rename_map: Option<&HashMap<&str, &str>>,
+) -> Result<(), Box<dyn Error>> {
+    CTTokenMapBuilder::new(mod_name, token_map)
+        .rename_map(rename_map)
+        .allow_dead_code(true)
+        .build()
 }
 
 /// Indents a multi-line string and trims any trailing newline.