Skip to content

Commit a1ad58d

Browse files
committed
Merge branch '2024-09_pr126452'
2 parents 11d9dba + 9b52f34 commit a1ad58d

16 files changed

+160
-18
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
This repository includes:
22

3-
* a detailed description of the Rust 1.80 lexer (in `writeup`)
3+
* a detailed description of the proposed Rust 1.83 lexer (in `writeup`)
44
* a Rust reimplementation of the lexer based on that description (in `src`)
55
* a manual list of testcases
66
* a harness for running `rustc`'s lexer in-process (via `rustc_private`)

src/combination.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ pub enum CoarseTokenData {
5757
LifetimeOrLabel {
5858
name: Charseq,
5959
},
60+
RawLifetimeOrLabel {
61+
name: Charseq,
62+
},
6063
ByteLiteral {
6164
represented_byte: u8,
6265
suffix: Charseq,
@@ -327,6 +330,9 @@ impl TryFrom<FineTokenData> for CoarseTokenData {
327330
FineTokenData::LifetimeOrLabel { name } => {
328331
Ok(CoarseTokenData::LifetimeOrLabel { name })
329332
}
333+
FineTokenData::RawLifetimeOrLabel { name } => {
334+
Ok(CoarseTokenData::RawLifetimeOrLabel { name })
335+
}
330336
FineTokenData::ByteLiteral {
331337
represented_byte,
332338
suffix,

src/lex_via_rustc.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ pub enum RustcTokenData {
7878
identifier: String,
7979
},
8080
Lifetime {
81+
style: RustcIdentIsRaw,
8182
/// This includes the leading '
8283
symbol: String,
8384
},
@@ -133,7 +134,7 @@ pub enum RustcDocCommentStyle {
133134
Outer,
134135
}
135136

136-
/// Whether an identifier was written in raw form.
137+
/// Whether an identifier or lifetime/label was written in raw form.
137138
pub enum RustcIdentIsRaw {
138139
No,
139140
Yes,
@@ -398,7 +399,8 @@ fn token_from_ast_token(
398399
style: style.into(),
399400
identifier: symbol.to_string(),
400401
},
401-
TokenKind::Lifetime(symbol, _is_raw) => RustcTokenData::Lifetime {
402+
TokenKind::Lifetime(symbol, style) => RustcTokenData::Lifetime {
403+
style: style.into(),
402404
symbol: symbol.to_string(),
403405
},
404406
TokenKind::Literal(rustc_ast::token::Lit {

src/lexlucid/pretokenisation.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ pub enum PretokenData {
5858
LifetimeOrLabel {
5959
name: Charseq,
6060
},
61+
RawLifetimeOrLabel {
62+
name: Charseq,
63+
},
6164
SingleQuoteLiteral {
6265
prefix: Charseq,
6366
literal_content: Charseq,

src/lexlucid/pretokenisation/pretokenisation_rules.rs

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ enum RuleName {
2929
UnterminatedBlockComment,
3030
Punctuation,
3131
SingleQuotedLiteral,
32-
LifetimeOrLabel,
32+
RawLifetimeOrLabel2021,
33+
ReservedLifetimeOrLabelPrefix2021,
34+
NonRawLifetimeOrLabel,
3335
DoublequotedNonrawLiteral2015,
3436
DoublequotedNonrawLiteral2021,
3537
DoublequotedHashlessRawLiteral2015,
@@ -56,7 +58,7 @@ const RULES_FOR_EDITION_2015: &[RuleName] = [
5658
RuleName::UnterminatedBlockComment,
5759
RuleName::Punctuation,
5860
RuleName::SingleQuotedLiteral,
59-
RuleName::LifetimeOrLabel,
61+
RuleName::NonRawLifetimeOrLabel,
6062
RuleName::DoublequotedNonrawLiteral2015,
6163
RuleName::DoublequotedHashlessRawLiteral2015,
6264
RuleName::DoublequotedHashedRawLiteral2015,
@@ -80,7 +82,9 @@ const RULES_FOR_EDITION_2021: &[RuleName] = [
8082
RuleName::UnterminatedBlockComment,
8183
RuleName::Punctuation,
8284
RuleName::SingleQuotedLiteral,
83-
RuleName::LifetimeOrLabel,
85+
RuleName::RawLifetimeOrLabel2021,
86+
RuleName::ReservedLifetimeOrLabelPrefix2021,
87+
RuleName::NonRawLifetimeOrLabel,
8488
RuleName::DoublequotedNonrawLiteral2021,
8589
RuleName::DoublequotedHashlessRawLiteral2021,
8690
RuleName::DoublequotedHashedRawLiteral2021,
@@ -183,7 +187,30 @@ fn make_named_rules() -> BTreeMap<RuleName, Rule> {
183187
"##)),
184188

185189
// Raw lifetime or label
186-
(RuleName::LifetimeOrLabel,
190+
(RuleName::RawLifetimeOrLabel2021,
191+
Rule::new_regex(
192+
|cp| PretokenData::RawLifetimeOrLabel {
193+
name: cp["name"].into(),
194+
}, r##"\A
195+
' r \#
196+
(?<name>
197+
[ \p{XID_Start} _ ]
198+
\p{XID_Continue} *
199+
)
200+
"##)),
201+
202+
// Reserved lifetime or label prefix
203+
(RuleName::ReservedLifetimeOrLabelPrefix2021,
204+
Rule::new_regex(
205+
|_| PretokenData::Reserved, r##"\A
206+
'
207+
[ \p{XID_Start} _ ]
208+
\p{XID_Continue} *
209+
\#
210+
"##)),
211+
212+
// Non-raw lifetime or label
213+
(RuleName::NonRawLifetimeOrLabel,
187214
Rule::new_regex_with_forbidden_follower(
188215
|cp| PretokenData::LifetimeOrLabel {
189216
name: cp["name"].into(),

src/lexlucid/reprocessing.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ pub enum FineTokenData {
5353
LifetimeOrLabel {
5454
name: Charseq,
5555
},
56+
RawLifetimeOrLabel {
57+
name: Charseq,
58+
},
5659
CharacterLiteral {
5760
represented_character: char,
5861
suffix: Charseq,
@@ -158,6 +161,9 @@ pub fn reprocess(pretoken: &Pretoken) -> Result<FineToken, Error> {
158161
PretokenData::LifetimeOrLabel { name } => {
159162
FineTokenData::LifetimeOrLabel { name: name.clone() }
160163
}
164+
PretokenData::RawLifetimeOrLabel { name } => {
165+
FineTokenData::RawLifetimeOrLabel { name: name.clone() }
166+
}
161167
PretokenData::SingleQuoteLiteral {
162168
prefix,
163169
literal_content,

src/proptesting/strategies.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ pub const SIMPLE_STRATEGIES: &[(&str, &str)] = [
1414
("line-comment", r#"[/! a\n]{1,10}"#),
1515
("punctuation", r#"[-!#$%&*+,./:;<=>?@^_|~ ]{1,8}"#),
1616
("identifier", "[_#ra£áΩ🦀\x07\u{FFFF}. ]{1,12}"),
17+
("lifetime", "['#ra£🦀]{1,8}"),
1718
("string-literal", r#"[\\\n#'"rbcx _]{1,12}"#),
1819
("unicode-escape", r#""\\u\{.{0,8}[} ]""#),
1920
("hashed-raw", r#"(r|br|cr)#[\\\n#"rx _]{1,10}"#),
@@ -47,6 +48,7 @@ pub(crate) fn mix() -> BoxedStrategy<String> {
4748
r#"[/! a\n]{1,5}"#, // line-comment
4849
r#"[-!#$%&*+,./:;<=>?@^_|~ ]{1,5}"#, // punctuation
4950
"[_#ra£áΩ🦀\x07\u{FFFF}. ]{1,3}", // identifier
51+
"['#ra]{1,3}", // lifetime
5052
r#"[\\\n"'#rbcx ]{1,8}"#, // string-literal
5153
r#"[01][-+._012389abcdefghoxABCDEYZHOX]{1,8}"#, // numeric-literal
5254
"\0", // just a NUL

src/regular_tokens.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ pub enum RegularTokenData {
6767
LifetimeOrLabel {
6868
/// This includes the leading '
6969
symbol: Charseq,
70+
style: IdentifierStyle,
7071
},
7172
ByteLiteral {
7273
represented_byte: u8,
@@ -158,8 +159,12 @@ pub fn regularise_from_rustc(tokens: impl IntoIterator<Item = RustcToken>) -> Ve
158159
represented_identifier: identifier.into(),
159160
style: style.into(),
160161
},
161-
RustcTokenData::Lifetime { symbol: name } => RegularTokenData::LifetimeOrLabel {
162+
RustcTokenData::Lifetime {
163+
style,
164+
symbol: name,
165+
} => RegularTokenData::LifetimeOrLabel {
162166
symbol: name.into(),
167+
style: style.into(),
163168
},
164169
RustcTokenData::Lit { literal_data } => regularise_rustc_literal(literal_data)
165170
.expect("rustc token represented an error"),
@@ -296,6 +301,11 @@ fn from_coarse_token(token: CoarseToken) -> RegularTokenData {
296301
},
297302
CoarseTokenData::LifetimeOrLabel { name } => RegularTokenData::LifetimeOrLabel {
298303
symbol: once('\'').chain(name.iter().copied()).collect(),
304+
style: IdentifierStyle::NonRaw,
305+
},
306+
CoarseTokenData::RawLifetimeOrLabel { name } => RegularTokenData::LifetimeOrLabel {
307+
symbol: once('\'').chain(name.iter().copied()).collect(),
308+
style: IdentifierStyle::Raw,
299309
},
300310
CoarseTokenData::CharacterLiteral {
301311
represented_character,

src/testcases.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ pub const LONGLIST: &[&str] = [
362362
" r###############################################################################################################################################################################################################################################################\"aaa\"############################################################################################################################################################################################################################################################### ",
363363
" r################################################################################################################################################################################################################################################################\"aaa\"################################################################################################################################################################################################################################################################ ",
364364

365-
//// Single-quote forms
365+
//// Single-quote forms (both character literals and lifetimes/labels)
366366

367367
" '",
368368
" ''",
@@ -390,6 +390,21 @@ pub const LONGLIST: &[&str] = [
390390
" 'xxx'y'z",
391391
" 'xxx'yyy'z",
392392
" 'xxx'y y'z",
393+
" '_",
394+
" '#",
395+
" '##",
396+
" '#x",
397+
" '#x'y",
398+
" 'r#",
399+
" 'x#",
400+
" 'xx#",
401+
" 'r#x",
402+
" 'x#x",
403+
" 'xx#x",
404+
" '$#x",
405+
" '£#x",
406+
" 'r#x'y",
407+
" 'r#xx'y",
393408
" x'",
394409
" x''",
395410
" x'''",
@@ -459,6 +474,19 @@ pub const LONGLIST: &[&str] = [
459474
" _'yyy'z'",
460475
" _'y y'z'",
461476

477+
"'_",
478+
"'__",
479+
"'for",
480+
"'crate",
481+
"'self",
482+
"'super",
483+
"'Self",
484+
"'r#_",
485+
"'r#__",
486+
"'r#crate",
487+
"'r#self",
488+
"'r#super",
489+
"'r#Self",
462490

463491
//// Possible future forms
464492

@@ -472,6 +500,7 @@ pub const LONGLIST: &[&str] = [
472500
"'a\u{0301}",
473501
"'\u{e1}",
474502
"'Kelvin 'Kelvin",
503+
"'r#Kelvin 'r#Kelvin",
475504

476505
//// String escaping (prelexing behaviour)
477506

writeup/fine_grained_tokens.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Each fine-grained token has a <dfn>kind</dfn>, and possibly also some attributes
1515
| `Identifier` | <var>represented identifier</var> |
1616
| `RawIdentifier` | <var>represented identifier</var> |
1717
| `LifetimeOrLabel` | <var>name</var> |
18+
| `RawLifetimeOrLabel` | <var>name</var> |
1819
| `CharacterLiteral` | <var>represented character</var>, <var>suffix</var> |
1920
| `ByteLiteral` | <var>represented byte</var>, <var>suffix</var> |
2021
| `StringLiteral` | <var>represented string</var>, <var>suffix</var> |

0 commit comments

Comments
 (0)