Commit 78a5fd2

Replace ct_token_map with CTTokenMapBuilder
This commit adds `CTTokenMapBuilder`, which replaces `ct_token_map`. It provides a few new features:

- the generated module is no longer `#[allow(dead_code)]` unless this is manually enabled. This helps to spot errors in custom lexers where some of the tokens are never emitted;
- if the generated module would contain invalid identifiers, generation fails and suggests using `CTTokenMapBuilder::rename_map`;
- `CTTokenMapBuilder::rename_map` accepts any iterable that yields pairs of strings, eliminating the need to convert static arrays of string pairs before passing them to the builder.

The `ct_token_map` function is deprecated starting with the next minor release (`0.14`).
1 parent f7397a7 commit 78a5fd2

4 files changed: +210 additions, -106 deletions

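Taken together, the change amounts to a small `build.rs` migration. A minimal before/after sketch distilled from the documentation diff below (not a complete build script):

```rust
// Before: the free function, with the rename map as a third argument; the
// generated module was always marked #[allow(dead_code)].
ct_token_map::<u8>("token_map", ctp.token_map(), None).unwrap();

// After: the builder, where each option is an explicit method call and
// dead-code warnings are on unless .allow_dead_code(true) is set.
CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map())
    .build()
    .unwrap();
```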
doc/src/manuallexer.md
Lines changed: 7 additions & 7 deletions

@@ -28,17 +28,17 @@ and the boiler-plate that comes with it unwanted. Fortunately, `lrlex` provides
 [`lrpar::Lexeme`](https://softdevteam.github.io/grmtools/master/api/lrpar/trait.Lexeme.html)
 trait.

-3. `lrlex` exposes a
-   [`ct_token_map`](https://softdevteam.github.io/grmtools/master/api/lrlex/fn.ct_token_map.html)
-   function to be used from `build.rs` scripts which automatically produces a
-   Rust module with one constant per token ID. `ct_token_map` is explicitly
+3. `lrlex` exposes
+   [`CTTokenMapBuilder`](https://softdevteam.github.io/grmtools/master/api/lrlex/struct.CTTokenMapBuilder.html)
+   to be used from `build.rs` scripts which automatically produces a
+   Rust module with one constant per token ID. It is explicitly
    designed to be easy to use with `lrpar`'s compile-time building.

 Putting these together is then relatively easy. First a `build.rs` file for a
 hand-written lexer will look roughly as follows:

 ```rust
-use lrlex::{ct_token_map, DefaultLexerTypes};
+use lrlex::{CTTokenMapBuilder, DefaultLexerTypes};
 use lrpar::CTParserBuilder;

 fn main() {
@@ -47,7 +47,7 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-    ct_token_map::<u8>("token_map", ctp.token_map(), None).unwrap()
+    CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map()).build().unwrap()
 }
 ```

@@ -65,7 +65,7 @@ Expr -> Result<u64, ()>:
 the module will contain `const T_PLUS: u8 = ...;`.

 Since Yacc grammars can contain token identifiers which are not valid Rust
-identifiers, `ct_token_map` allows you to provide a map from the token
+identifiers, `CTTokenMapBuilder` allows you to provide a map from the token
 identifier to a "Rust friendly" variant. For example, for the following grammar
 excerpt:
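The constants generated by the builder are consumed via `lrlex_mod!`, as the doc comments in the `ctbuilder.rs` diff below note. A rough sketch of the consuming side, assuming the module name `token_map` from the `build.rs` above and a token that ends up as `T_PLUS` (the surrounding lexer code is omitted):

```rust
use lrlex::lrlex_mod;

// Pull in the module generated into OUT_DIR by the build script; the name
// matches the first argument given to CTTokenMapBuilder::new ("token_map").
lrlex_mod!("token_map");

// A hand-written lexer can then refer to token_map::T_PLUS, token_map::T_ID,
// etc. instead of hard-coding numeric token IDs, e.g.:
fn plus_token_id() -> u8 {
    token_map::T_PLUS
}
```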

lrlex/examples/calc_manual_lex/build.rs
Lines changed: 5 additions & 7 deletions

@@ -1,4 +1,4 @@
-use lrlex::{DefaultLexerTypes, ct_token_map};
+use lrlex::{CTTokenMapBuilder, DefaultLexerTypes};
 use lrpar::CTParserBuilder;

 // Some of the token names in the parser do not lead to valid Rust identifiers, so we map them to
@@ -16,10 +16,8 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-    ct_token_map::<u8>(
-        "token_map",
-        ctp.token_map(),
-        Some(&TOKENS_MAP.iter().cloned().collect()),
-    )
-    .unwrap();
+    CTTokenMapBuilder::<u8>::new("token_map", ctp.token_map())
+        .rename_map(Some(TOKENS_MAP))
+        .build()
+        .unwrap();
 }
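The `TOKENS_MAP` static referred to above is defined outside this hunk. Since `rename_map` now accepts any iterable of string pairs, such a static can be passed straight through without first collecting it into a `HashMap`; a hypothetical shape (these entries are illustrative, not the example's actual table):

```rust
// Hypothetical rename table: Yacc token names that are not valid Rust
// identifiers mapped to "Rust friendly" ones, producing T_PLUS, T_LBRACK, ...
const TOKENS_MAP: [(&str, &str); 3] = [("+", "PLUS"), ("(", "LBRACK"), (")", "RBRACK")];
```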

lrlex/src/lib/ctbuilder.rs
Lines changed: 193 additions & 91 deletions

@@ -1,19 +1,5 @@
 //! Build grammars at run-time.

-use std::{
-    any::type_name,
-    borrow::Borrow,
-    collections::{HashMap, HashSet},
-    env::{current_dir, var},
-    error::Error,
-    fmt::{self, Debug, Display, Write as _},
-    fs::{self, File, create_dir_all, read_to_string},
-    hash::Hash,
-    io::Write,
-    path::{Path, PathBuf},
-    sync::{LazyLock, Mutex},
-};
-
 use bincode::Encode;
 use cfgrammar::{
     header::{
@@ -29,9 +15,23 @@ use lrpar::{
     diagnostics::{DiagnosticFormatter, SpannedDiagnosticFormatter},
 };
 use num_traits::{AsPrimitive, PrimInt, Unsigned};
-use proc_macro2::TokenStream;
+use proc_macro2::{Ident, TokenStream};
 use quote::{ToTokens, TokenStreamExt, format_ident, quote};
 use regex::Regex;
+use std::marker::PhantomData;
+use std::{
+    any::type_name,
+    borrow::Borrow,
+    collections::{HashMap, HashSet},
+    env::{current_dir, var},
+    error::Error,
+    fmt::{self, Debug, Display, Write as _},
+    fs::{self, File, create_dir_all, read_to_string},
+    hash::Hash,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::{LazyLock, Mutex},
+};

 use crate::{DefaultLexerTypes, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef};

@@ -1183,12 +1183,15 @@ impl CTLexer {
     }
 }

-/// Create a Rust module named `mod_name` that can be imported with
-/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod). The module contains one `const` `StorageT` per
-/// token in `token_map`, with the token prefixed by `T_`. In addition, it will
-/// contain an array of all token IDs `TOK_IDS`.
+/// Exports all token IDs used by a parser as a separate Rust module.
+///
+/// This builder will create a Rust module named `mod_name`
+/// that can be imported with [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
+/// The module will contain one `const` `StorageT` per token in `token_map`,
+/// with the token prefixed by `T_`. In addition, it will contain
+/// an array of all token IDs `TOK_IDS`.
 ///
-/// For example with `StorageT` `u8`, `mod_name` `x`, and `token_map`
+/// For example, if `StorageT` is `u8`, `mod_name` is `x`, and `token_map` is
 /// `HashMap{"ID": 0, "INT": 1}` the generated module will look roughly as follows:
 ///
 /// ```rust,ignore
@@ -1199,83 +1202,182 @@ impl CTLexer {
 /// }
 /// ```
 ///
-/// You can optionally remap names (for example, because the parser's token names do not lead to
-/// valid Rust identifiers) by specifying the `rename_map` `HashMap`. For example, if `token_map`
-/// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}` then the generated
-/// module will look roughly as follows:
+/// See the [custom lexer example] for more usage details.
 ///
-/// ```rust,ignore
-/// mod x {
-///     pub const T_PLUS: u8 = 0;
-///     pub const T_ID: u8 = 1;
-///     pub const TOK_IDS: &[u8] = &[T_PLUS, T_ID];
-/// }
-/// ```
-pub fn ct_token_map<StorageT: Display + ToTokens>(
-    mod_name: &str,
-    token_map: impl Borrow<HashMap<String, StorageT>>,
-    rename_map: Option<&HashMap<&str, &str>>,
-) -> Result<(), Box<dyn Error>> {
-    // Record the time that this version of lrlex was built. If the source code changes and rustc
-    // forces a recompile, this will change this value, causing anything which depends on this
-    // build of lrlex to be recompiled too.
-    let mut outs = String::new();
-    let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
-    let mod_ident = format_ident!("{}", mod_name);
-    write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
-    let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
-    // Sort the tokens so that they're always in the same order.
-    // This will prevent unneeded rebuilds.
-    let mut token_map_sorted = Vec::from_iter(token_map.borrow().iter());
-    token_map_sorted.sort_by_key(|(k, _)| *k);
-    let (token_array, tokens): (TokenStream, TokenStream) = token_map_sorted
-        .iter()
-        .map(|(k, id)| {
-            let name = match rename_map {
-                Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
-                _ => &k,
-            };
-            let tok_ident = format_ident!("T_{}", name.to_ascii_uppercase());
-            (
-                quote! {
-                    #tok_ident,
-                },
-                quote! {
-                    pub const #tok_ident: #storaget = #id;
-                },
-            )
-        })
-        .unzip();
-    // Since the formatter doesn't preserve comments and we don't want to lose build time,
-    // just format the module contents.
-    let unformatted = quote! {
-        mod #mod_ident {
-            #![allow(dead_code)]
-            #tokens
-            pub const TOK_IDS: &[#storaget] = &[#token_array];
+/// [custom lexer example]: https://github.com/softdevteam/grmtools/tree/master/lrlex/examples/calc_manual_lex
+#[derive(Debug, Clone)]
+pub struct CTTokenMapBuilder<StorageT: Display + ToTokens> {
+    mod_name: String,
+    token_map: Vec<(String, TokenStream)>,
+    rename_map: Option<HashMap<String, String>>,
+    allow_dead_code: bool,
+    _marker: PhantomData<StorageT>,
+}
+
+impl<StorageT: Display + ToTokens> CTTokenMapBuilder<StorageT> {
+    /// Create a new token map builder.
+    ///
+    /// See the [builder documentation] for more info.
+    ///
+    /// [builder documentation]: CTTokenMapBuilder
+    pub fn new(
+        mod_name: impl Into<String>,
+        token_map: impl Borrow<HashMap<String, StorageT>>,
+    ) -> Self {
+        Self {
+            mod_name: mod_name.into(),
+            token_map: token_map
+                .borrow()
+                .iter()
+                .map(|(tok_name, tok_value)| (tok_name.clone(), tok_value.to_token_stream()))
+                .collect(),
+            rename_map: None,
+            allow_dead_code: false,
+            _marker: PhantomData,
         }
     }
-    .to_string();
-    let out_mod = syn::parse_str(&unformatted)
-        .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
-        .unwrap_or(unformatted);
-    outs.push_str(&out_mod);
-    let mut outp = PathBuf::from(var("OUT_DIR")?);
-    outp.push(mod_name);
-    outp.set_extension("rs");
-
-    // If the file we're about to write out already exists with the same contents, then we
-    // don't overwrite it (since that will force a recompile of the file, and relinking of the
-    // binary etc).
-    if let Ok(curs) = read_to_string(&outp) {
-        if curs == outs {
-            return Ok(());
+
+    /// Set a token rename map.
+    ///
+    /// Rename map is used to specify identifier names for tokens whose names
+    /// are not valid Rust identifiers. For example, if `token_map`
+    /// is `HashMap{"+": 0, "ID": 1}` and `rename_map` is `HashMap{"+": "PLUS"}`
+    /// then the generated module will look roughly as follows:
+    ///
+    /// ```rust,ignore
+    /// mod x {
+    ///     pub const T_PLUS: u8 = 0;
+    ///     pub const T_ID: u8 = 1;
+    /// }
+    /// ```
+    pub fn rename_map<M, I, K, V>(mut self, rename_map: Option<M>) -> Self
+    where
+        M: IntoIterator<Item = I>,
+        I: Borrow<(K, V)>,
+        K: AsRef<str>,
+        V: AsRef<str>,
+    {
+        self.rename_map = rename_map.map(|rename_map| {
+            rename_map
+                .into_iter()
+                .map(|it| {
+                    let (k, v) = it.borrow();
+                    let k = k.as_ref().into();
+                    let v = v.as_ref().into();
+                    (k, v)
+                })
+                .collect()
+        });
+        self
+    }
+
+    /// Control whether the builder will add `#[allow(dead_code)]`
+    /// to the generated module.
+    ///
+    /// By default, all tokens are `#[deny(dead_code)]`, meaning that you'll
+    /// get a warning if your custom lexer doesn't use any of them.
+    /// This function can be used to disable this behavior.
+    pub fn allow_dead_code(mut self, allow_dead_code: bool) -> Self {
+        self.allow_dead_code = allow_dead_code;
+        self
+    }
+
+    /// Build the token map module.
+    pub fn build(&self) -> Result<(), Box<dyn Error>> {
+        // Record the time that this version of lrlex was built. If the source code changes and rustc
+        // forces a recompile, this will change this value, causing anything which depends on this
+        // build of lrlex to be recompiled too.
+        let mut outs = String::new();
+        let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
+        let mod_ident = format_ident!("{}", self.mod_name);
+        write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
+        let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
+        // Sort the tokens so that they're always in the same order.
+        // This will prevent unneeded rebuilds.
+        let mut token_map_sorted = self.token_map.clone();
+        token_map_sorted.sort_by(|(l, _), (r, _)| l.cmp(r));
+        let (token_array, tokens) = token_map_sorted
+            .iter()
+            .map(|(k, id)| {
+                let name = match &self.rename_map {
+                    Some(rmap) => rmap.get(k).unwrap_or(k),
+                    _ => k,
+                };
+                let tok_ident: Ident = syn::parse_str(&format!("T_{}", name.to_ascii_uppercase()))
+                    .map_err(|e| {
+                        format!(
+                            "token name {:?} is not a valid Rust identifier: {}; \
+                             consider renaming it via `CTTokenMapBuilder::rename_map`.",
+                            name, e
+                        )
+                    })?;
+                Ok((
+                    // Note: the array of all tokens can't use `tok_ident` because
+                    // it will confuse the dead code checker. For this reason,
+                    // we use `id` here.
+                    quote! {
+                        #id,
+                    },
+                    quote! {
+                        pub const #tok_ident: #storaget = #id;
+                    },
+                ))
+            })
+            .collect::<Result<(TokenStream, TokenStream), Box<dyn Error>>>()?;
+        let unused_annotation;
+        if self.allow_dead_code {
+            unused_annotation = quote! {#[allow(dead_code)]};
+        } else {
+            unused_annotation = quote! {};
+        };
+        // Since the formatter doesn't preserve comments and we don't want to lose build time,
+        // just format the module contents.
+        let unformatted = quote! {
+            #unused_annotation
+            mod #mod_ident {
+                #tokens
+                #[allow(dead_code)]
+                pub const TOK_IDS: &[#storaget] = &[#token_array];
+            }
+        }
+        .to_string();
+        let out_mod = syn::parse_str(&unformatted)
+            .map(|syntax_tree| prettyplease::unparse(&syntax_tree))
+            .unwrap_or(unformatted);
+        outs.push_str(&out_mod);
+        let mut outp = PathBuf::from(var("OUT_DIR")?);
+        outp.push(&self.mod_name);
+        outp.set_extension("rs");
+
+        // If the file we're about to write out already exists with the same contents, then we
+        // don't overwrite it (since that will force a recompile of the file, and relinking of the
+        // binary etc).
+        if let Ok(curs) = read_to_string(&outp) {
+            if curs == outs {
+                return Ok(());
+            }
         }
+
+        let mut f = File::create(outp)?;
+        f.write_all(outs.as_bytes())?;
+        Ok(())
     }
+}

-    let mut f = File::create(outp)?;
-    f.write_all(outs.as_bytes())?;
-    Ok(())
+/// Create a Rust module named `mod_name` that can be imported with
+/// [`lrlex_mod!(mod_name)`](crate::lrlex_mod).
+///
+/// This function is deprecated in favour of [`CTTokenMapBuilder`].
+#[deprecated(since = "0.14", note = "use `lrlex::CTTokenMapBuilder` instead")]
+pub fn ct_token_map<StorageT: Display + ToTokens>(
+    mod_name: &str,
+    token_map: impl Borrow<HashMap<String, StorageT>>,
+    rename_map: Option<&HashMap<&str, &str>>,
+) -> Result<(), Box<dyn Error>> {
+    CTTokenMapBuilder::new(mod_name, token_map)
+        .rename_map(rename_map)
+        .allow_dead_code(true)
+        .build()
 }

 /// Indents a multi-line string and trims any trailing newline.

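Because the generated module now denies dead code by default, a custom lexer that deliberately never emits some tokens will start producing warnings. The deprecated `ct_token_map` wrapper in the diff above shows the escape hatch; a minimal self-contained sketch of opting back into the old behaviour from a `build.rs` (the literal token map here merely stands in for `CTParserBuilder::token_map()`):

```rust
use lrlex::CTTokenMapBuilder;
use std::collections::HashMap;

fn main() {
    // In a real build script this map would come from lrpar's CTParserBuilder;
    // a literal map keeps the sketch self-contained.
    let token_map: HashMap<String, u8> =
        HashMap::from([("PLUS".to_owned(), 0), ("ID".to_owned(), 1)]);
    // allow_dead_code(true) restores ct_token_map's old behaviour of adding
    // #[allow(dead_code)] to the generated module.
    CTTokenMapBuilder::<u8>::new("token_map", &token_map)
        .allow_dead_code(true)
        .build()
        .unwrap();
}
```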
lrlex/src/lib/mod.rs
Lines changed: 5 additions & 1 deletion

@@ -20,8 +20,12 @@ pub mod defaults;
 mod lexer;
 mod parser;

+#[allow(deprecated)]
 pub use crate::{
-    ctbuilder::{CTLexer, CTLexerBuilder, LexerKind, RustEdition, Visibility, ct_token_map},
+    ctbuilder::{
+        CTLexer, CTLexerBuilder, CTTokenMapBuilder, LexerKind, RustEdition, Visibility,
+        ct_token_map,
+    },
     defaults::{DefaultLexeme, DefaultLexerTypes},
     lexer::{
         DEFAULT_LEX_FLAGS, LRNonStreamingLexer, LRNonStreamingLexerDef, LexFlags, LexerDef, Rule,
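The `#[allow(deprecated)]` on the re-export is presumably there so that re-exporting the now-deprecated function does not itself trigger the lint inside lrlex. Downstream code that cannot migrate immediately can do the same at its own call site; a small sketch (the helper name is made up for illustration):

```rust
use std::collections::HashMap;

// Silence the deprecation warning locally until the call is ported to
// CTTokenMapBuilder; `emit_legacy_token_map` is a hypothetical helper.
#[allow(deprecated)]
fn emit_legacy_token_map(token_map: &HashMap<String, u8>) {
    lrlex::ct_token_map::<u8>("token_map", token_map, None).unwrap();
}
```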
