1 change: 1 addition & 0 deletions Cargo.toml
@@ -44,3 +44,4 @@ sparsevec = "0.2"
static_assertions = "1.1"
unicode-width = "0.1.11"
vob = ">=3.0.2"
proc-macro2 = "1.0"
2 changes: 2 additions & 0 deletions cfgrammar/Cargo.toml
@@ -21,3 +21,5 @@ num-traits.workspace = true
regex.workspace = true
serde = { workspace = true, features = ["derive"], optional = true }
vob = { workspace = true, features = ["serde"] }
quote.workspace = true
proc-macro2.workspace = true
9 changes: 9 additions & 0 deletions cfgrammar/src/lib/span.rs
@@ -1,3 +1,5 @@
use proc_macro2::TokenStream;
use quote::{quote, ToTokens, TokenStreamExt};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

@@ -54,3 +56,10 @@ pub trait Spanned: std::fmt::Display {
/// Returns the `SpansKind` associated with this error.
fn spanskind(&self) -> crate::yacc::parser::SpansKind;
}

impl ToTokens for Span {
fn to_tokens(&self, tokens: &mut TokenStream) {
let Span { start, end } = self;
tokens.append_all(quote! {::cfgrammar::Span::new(#start, #end)});
}
}
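A minimal sketch of what this impl emits when a `Span` is interpolated (illustrative, assuming `Span::new(usize, usize)` as above; spacing in the printed output is proc-macro2's token-stream display):

```rust
use cfgrammar::Span;
use quote::quote;

fn main() {
    // Interpolating a Span emits a constructor call that rebuilds the
    // same span inside generated code; integer literals carry a usize suffix.
    let span = Span::new(3, 7);
    let ts = quote! { let s = #span; };
    // Prints roughly: let s = :: cfgrammar :: Span :: new (3usize , 7usize) ;
    println!("{ts}");
}
```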
30 changes: 30 additions & 0 deletions cfgrammar/src/lib/yacc/mod.rs
@@ -10,6 +10,8 @@ pub use self::{
grammar::{AssocKind, Precedence, SentenceGenerator, YaccGrammar},
parser::{YaccGrammarError, YaccGrammarErrorKind, YaccGrammarWarning, YaccGrammarWarningKind},
};
use proc_macro2::TokenStream;
use quote::quote;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
@@ -39,6 +41,18 @@ pub enum YaccKind {
Eco,
}

impl quote::ToTokens for YaccKind {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.extend(match *self {
YaccKind::Grmtools => quote!(::cfgrammar::yacc::YaccKind::Grmtools),
YaccKind::Original(action_kind) => {
quote!(::cfgrammar::yacc::YaccKind::Original(#action_kind))
}
YaccKind::Eco => quote!(::cfgrammar::yacc::YaccKind::Eco),
})
}
}

#[derive(Clone, Copy, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum YaccOriginalActionKind {
@@ -50,3 +64,19 @@ pub enum YaccOriginalActionKind {
/// Do not execute actions of any sort.
NoAction,
}

impl quote::ToTokens for YaccOriginalActionKind {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.extend(match *self {
YaccOriginalActionKind::UserAction => {
quote!(::cfgrammar::yacc::YaccOriginalActionKind::UserAction)
}
YaccOriginalActionKind::GenericParseTree => {
quote!(::cfgrammar::yacc::YaccOriginalActionKind::GenericParseTree)
}
YaccOriginalActionKind::NoAction => {
quote!(::cfgrammar::yacc::YaccOriginalActionKind::NoAction)
}
})
}
}
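These two impls compose: interpolating a `YaccKind::Original` value recursively invokes the inner action kind's `ToTokens`. A minimal sketch, assuming both impls land as in this diff:

```rust
use cfgrammar::yacc::{YaccKind, YaccOriginalActionKind};
use quote::quote;

fn main() {
    // The `#action_kind` interpolation inside YaccKind's impl calls
    // YaccOriginalActionKind's own ToTokens impl.
    let kind = YaccKind::Original(YaccOriginalActionKind::NoAction);
    // Prints (modulo token spacing):
    // ::cfgrammar::yacc::YaccKind::Original(::cfgrammar::yacc::YaccOriginalActionKind::NoAction)
    println!("{}", quote!(#kind));
}
```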
1 change: 1 addition & 0 deletions lrlex/Cargo.toml
@@ -33,5 +33,6 @@ lazy_static.workspace = true
regex.workspace = true
regex-syntax.workspace = true
num-traits.workspace = true
proc-macro2.workspace = true
quote.workspace = true
serde.workspace = true
144 changes: 95 additions & 49 deletions lrlex/src/lib/ctbuilder.rs
@@ -19,7 +19,8 @@ use cfgrammar::{newlinecache::NewlineCache, Spanned};
use lazy_static::lazy_static;
use lrpar::{CTParserBuilder, LexerTypes};
use num_traits::{AsPrimitive, PrimInt, Unsigned};
use quote::quote;
use proc_macro2::TokenStream;
use quote::{quote, ToTokens, TokenStreamExt};
use regex::Regex;
use serde::Serialize;

@@ -78,11 +79,48 @@ pub enum RustEdition {
Rust2021,
}

/// The quote impl of `ToTokens` for `Option` prints an empty string for `None`
/// and the inner value for `Some(inner_value)`.
///
/// This wrapper instead emits an explicit, fully qualified `Some(...)` or `None`.
/// See: [quote #20](https://github.com/dtolnay/quote/issues/20)
struct QuoteOption<T>(Option<T>);

impl<T: ToTokens> ToTokens for QuoteOption<T> {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.append_all(match self.0 {
Some(ref t) => quote! { ::std::option::Option::Some(#t) },
None => quote! { ::std::option::Option::None },
});
}
}

/// This wrapper adds a missing impl of `ToTokens` for tuples.
/// For a tuple `(a, b)` this emits the tuple expression `(#a, #b)`.
struct QuoteTuple<T>(T);

impl<A: ToTokens, B: ToTokens> ToTokens for QuoteTuple<(A, B)> {
fn to_tokens(&self, tokens: &mut TokenStream) {
let (a, b) = &self.0;
tokens.append_all(quote!((#a, #b)));
}
}

/// The wrapped `&str` value is emitted followed by a call to `to_string()`.
struct QuoteToString<'a>(&'a str);

impl ToTokens for QuoteToString<'_> {
fn to_tokens(&self, tokens: &mut TokenStream) {
let x = &self.0;
tokens.append_all(quote! { #x.to_string() });
}
}
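Taken together, a sketch of what the three wrappers emit under interpolation (output shown modulo token spacing):

```rust
use quote::quote;

fn demo() {
    // QuoteOption: both variants appear, fully qualified.
    let some = QuoteOption(Some(5u32));
    let none = QuoteOption(None::<u32>);
    println!("{}", quote!(#some)); // ::std::option::Option::Some(5u32)
    println!("{}", quote!(#none)); // ::std::option::Option::None

    // QuoteTuple: a parenthesised pair of the interpolated elements.
    let pair = QuoteTuple((1u8, 2u8));
    println!("{}", quote!(#pair)); // (1u8, 2u8)

    // QuoteToString: the string literal plus an owning conversion.
    let s = QuoteToString("lex");
    println!("{}", quote!(#s)); // "lex".to_string()
}
```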

/// A `CTLexerBuilder` allows one to specify the criteria for building a statically generated
/// lexer.
pub struct CTLexerBuilder<'a, LexerTypesT: LexerTypes = DefaultLexerTypes<u32>>
where
LexerTypesT::StorageT: Debug + Eq + Hash,
LexerTypesT::StorageT: Debug + Eq + Hash + ToTokens,
Collaborator Author:

To me, this and the related bounds immediately below are the most controversial aspects of this patch, and the part we need to consider most carefully.

The other thing we could do here is change this ToTokens to Display.

Changing it to ToTokens mostly just enables one minor cleanup: the QuoteOption(tok_id) cleanup in 18f1d9d.

I think it would definitely be fine to consider using Display here instead of ToTokens and dropping that last cleanup?

Collaborator Author:

One more thing to note here, which I mentioned in a previous comment on usize: using ToTokens here also has the effect of including the type in the value, e.g. this patch currently encodes StorageT values as Some(5u32), a change from Some(5) with Debug and Display.

Looking at the sources generated by the testsuite, the StorageT values I'm seeing all use u32. So if we do go with ToTokens, it would be good to exercise this in the testsuite in a way that uses another type here, e.g. u8.
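For concreteness, a minimal sketch of the Some(5u32) vs Some(5) difference being discussed (illustrative only):

```rust
use quote::quote;

fn main() {
    let tok_id: u8 = 5;
    // Debug/Display formatting drops the concrete type...
    assert_eq!(format!("{:?}", Some(tok_id)), "Some(5)");
    // ...while ToTokens keeps it, so the generated literal is type-checked.
    println!("{}", quote!(#tok_id)); // prints: 5u8
}
```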

Collaborator Author:

I don't have a very strong opinion here, but I'm somewhat leaning towards liking the 1_u8 form of ToTokens.

I tested the codegen of ToTokens here by changing lrpar/examples/calc_actions/build.rs to use CTLexerBuilder::<DefaultLexerTypes<u8>>::new_with_lexemet(), and it makes sense to me that, in theory, it should be fine for StorageT to include the concrete type in literals, since the point is that it has a concrete size known to the caller.

In other places, where we rely on values being losslessly convertible to a usize, we could potentially use a newtype around known integer types to print literals without the type suffix.

Either way, since we didn't remove the Debug bound, at worst we can go back to using Debug?
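A sketch of the newtype idea mentioned above (the `Unsuffixed` name is hypothetical, not part of this patch):

```rust
use proc_macro2::{Literal, TokenStream};
use quote::{ToTokens, TokenStreamExt};

// Hypothetical: emit a usize as a bare literal, without the type suffix,
// for values we already know convert losslessly.
struct Unsuffixed(usize);

impl ToTokens for Unsuffixed {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        tokens.append(Literal::usize_unsuffixed(self.0));
    }
}
```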

Member:

I'm fine with having the type suffix for literals.

One thing I haven't fully understood is: what types is ToTokens implemented on? I assume it's implemented on all primitive integers by quote? In that sense, this isn't going to be much of a breaking change unless a user does something odd like pass a newtype here: that would force them to know about the ToTokens trait and quote. In that sense, Display would be preferable, but I take your point in 18f1d9d that Display means we end up having to do our own string escaping, which is a bit nasty. On that basis, I'm fine using ToTokens.

Collaborator Author:

It's basically all the concrete value types: bool, numeric types, strings, Option, plus reference types (minus Arc), and some macro-specific types that represent tokens.

It's sort of a hodgepodge of types; I don't see any reason why some things are missing (like tuples and Arc).

The full list is here:
https://docs.rs/quote/latest/quote/trait.ToTokens.html#foreign-impls

Member:

So long as the integer types are there, I think we'll have covered nearly every sensible use that I can conceive of.

Collaborator Author:

The one thing I would say is that for the specific case of StorageT we don't have to worry about string escaping at all, and its formatting is probably about as stable as format printing gets. So I don't think it is too far-fetched to use ToTokens internally, but not add the bound for StorageT and use Debug or Display just for that.

So to me, ToTokens over Debug or Display for StorageT mostly revolves around whether we want to be able to include the type suffix in integer literals. It isn't a huge thing, but it maybe adds a little bit of type checking we don't currently get (for better or worse!).

Member:

If our plan is to use quote (and friends) for all code generation inside grmtools, I think the ToTokens bound makes sense: we'll have bought into that ecosystem, might as well take advantage of it, and should make clear to users that we've done so. If we don't plan on going all the way with quote then perhaps we should just fall back on Display.

Collaborator Author:

Makes sense. I'd probably stick with the ToTokens bound then, since using it throughout will let us take advantage of it more heavily by implementing ToTokens for things that contain a StorageT. Otherwise we're doomed to a mixture of the quote ecosystem plus manually formatted values.
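A sketch of what implementing ToTokens for a type that contains a StorageT could look like (the `TokInfo` type is hypothetical, not part of this patch):

```rust
use proc_macro2::TokenStream;
use quote::{quote, ToTokens};

// Hypothetical composite: once StorageT: ToTokens holds, codegen for a
// containing type can be derived directly from its fields.
struct TokInfo<StorageT> {
    tok_id: StorageT,
}

impl<StorageT: ToTokens> ToTokens for TokInfo<StorageT> {
    fn to_tokens(&self, tokens: &mut TokenStream) {
        let tok_id = &self.tok_id;
        tokens.extend(quote!(TokInfo { tok_id: #tok_id }));
    }
}
```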

usize: num_traits::AsPrimitive<LexerTypesT::StorageT>,
{
lrpar_config: Option<Box<dyn Fn(CTParserBuilder<LexerTypesT>) -> CTParserBuilder<LexerTypesT>>>,
@@ -108,7 +146,7 @@ impl CTLexerBuilder<'_, DefaultLexerTypes<u32>> {
impl<'a, LexerTypesT: LexerTypes> CTLexerBuilder<'a, LexerTypesT>
where
LexerTypesT::StorageT:
'static + Debug + Eq + Hash + PrimInt + Serialize + TryFrom<usize> + Unsigned,
'static + Debug + Eq + Hash + PrimInt + Serialize + TryFrom<usize> + Unsigned + ToTokens,
usize: AsPrimitive<LexerTypesT::StorageT>,
{
/// Create a new [CTLexerBuilder].
@@ -438,31 +476,52 @@ pub fn lexerdef() -> {lexerdef_type} {{
)
.ok();

let RegexOptions {
dot_matches_new_line,
multi_line,
octal,
posix_escapes,
case_insensitive,
unicode,
swap_greed,
ignore_whitespace,
size_limit,
dfa_size_limit,
nest_limit,
} = self.regex_options;
let case_insensitive = QuoteOption(case_insensitive);
let unicode = QuoteOption(unicode);
let swap_greed = QuoteOption(swap_greed);
let ignore_whitespace = QuoteOption(ignore_whitespace);
let size_limit = QuoteOption(size_limit);
let dfa_size_limit = QuoteOption(dfa_size_limit);
let nest_limit = QuoteOption(nest_limit);

outs.push_str(&format!(
"let regex_options = ::lrlex::RegexOptions {{
dot_matches_new_line: {dot_matches_new_line:?},
multi_line: {multi_line:?},
octal: {octal:?},
posix_escapes: {posix_escapes:?},
case_insensitive: {case_insensitive:?},
unicode: {unicode:?},
swap_greed: {swap_greed:?},
ignore_whitespace: {ignore_whitespace:?},
size_limit: {size_limit:?},
dfa_size_limit: {dfa_size_limit:?},
nest_limit: {nest_limit:?},
dot_matches_new_line: {dot_matches_new_line},
multi_line: {multi_line},
octal: {octal},
posix_escapes: {posix_escapes},
case_insensitive: {case_insensitive},
unicode: {unicode},
swap_greed: {swap_greed},
ignore_whitespace: {ignore_whitespace},
size_limit: {size_limit},
dfa_size_limit: {dfa_size_limit},
nest_limit: {nest_limit},
}};",
dot_matches_new_line = self.regex_options.dot_matches_new_line,
multi_line = self.regex_options.multi_line,
octal = self.regex_options.octal,
posix_escapes = self.regex_options.posix_escapes,
case_insensitive = self.regex_options.case_insensitive,
unicode = self.regex_options.unicode,
swap_greed = self.regex_options.swap_greed,
ignore_whitespace = self.regex_options.ignore_whitespace,
size_limit = self.regex_options.size_limit,
dfa_size_limit = self.regex_options.dfa_size_limit,
nest_limit = self.regex_options.nest_limit,
dot_matches_new_line = quote!(#dot_matches_new_line),
multi_line = quote!(#multi_line),
octal = quote!(#octal),
posix_escapes = quote!(#posix_escapes),
case_insensitive = quote!(#case_insensitive),
unicode = quote!(#unicode),
swap_greed = quote!(#swap_greed),
ignore_whitespace = quote!(#ignore_whitespace),
size_limit = quote!(#size_limit),
dfa_size_limit = quote!(#dfa_size_limit),
nest_limit = quote!(#nest_limit),
));

outs.push_str(" let start_states: Vec<StartState> = vec![");
@@ -485,35 +544,22 @@ pub fn lexerdef() -> {lexerdef_type} {{

// Individual rules
for r in lexerdef.iter_rules() {
let tok_id = match r.tok_id {
Some(ref t) => format!("Some({:?})", t),
None => "None".to_owned(),
};
let n = match r.name() {
Some(ref n) => format!("Some({}.to_string())", quote!(#n)),
None => "None".to_owned(),
};
let target_state = match &r.target_state() {
Some((id, op)) => format!("Some(({}, ::lrlex::StartStateOperation::{:?}))", id, op),
None => "None".to_owned(),
};
let n_span = format!(
"::cfgrammar::Span::new({}, {})",
r.name_span().start(),
r.name_span().end()
);
let regex = &r.re_str;
let tok_id = QuoteOption(r.tok_id);
let n = QuoteOption(r.name().map(QuoteToString));
let target_state = QuoteOption(r.target_state().map(|(x, y)| QuoteTuple((x, y))));
let n_span = r.name_span();
let regex = QuoteToString(&r.re_str);
let start_states = r.start_states();
write!(
outs,
"
Rule::new(::lrlex::unstable_api::InternalPublicApi, {}, {}, {}, {}.to_string(), {}.to_vec(), {}, &regex_options).unwrap(),",
tok_id,
n,
n_span,
quote!(#tok_id),
quote!(#n),
quote!(#n_span),
quote!(#regex),
quote!([#(#start_states),*]),
target_state,
quote!(#target_state),
)
.ok();
}
@@ -537,10 +583,10 @@ pub fn lexerdef() -> {lexerdef_type} {{
if RE_TOKEN_ID.is_match(n) {
write!(
outs,
"#[allow(dead_code)]\npub const T_{}: {} = {:?};\n",
"#[allow(dead_code)]\npub const T_{}: {} = {};\n",
n.to_ascii_uppercase(),
type_name::<LexerTypesT::StorageT>(),
*id
quote!(#id)
)
.ok();
}
12 changes: 12 additions & 0 deletions lrlex/src/lib/parser.rs
@@ -76,6 +76,18 @@ pub enum StartStateOperation {
Pop,
}

use proc_macro2::TokenStream;
use quote::quote;
impl quote::ToTokens for StartStateOperation {
fn to_tokens(&self, tokens: &mut TokenStream) {
tokens.extend(match *self {
StartStateOperation::ReplaceStack => quote!(::lrlex::StartStateOperation::ReplaceStack),
StartStateOperation::Push => quote!(::lrlex::StartStateOperation::Push),
StartStateOperation::Pop => quote!(::lrlex::StartStateOperation::Pop),
})
}
}

pub(super) struct LexParser<LexerTypesT: LexerTypes>
where
usize: AsPrimitive<LexerTypesT::StorageT>,
4 changes: 3 additions & 1 deletion lrpar/Cargo.toml
@@ -34,10 +34,12 @@ indexmap.workspace = true
lazy_static.workspace = true
num-traits.workspace = true
packedvec.workspace = true
proc-macro2.workspace = true
quote.workspace = true
regex.workspace = true
serde = { workspace = true, features = ["derive"] }
static_assertions.workspace = true
vob.workspace = true
regex.workspace = true

[dev-dependencies]
tempfile = "3.0"