Skip to content

Commit f71b852

Browse files
author
Jakub Bukaj
committed
rollup merge of #19103: huonw/literal-suffixes
Futureproof Rust for fancier suffixed literals. The Rust compiler tokenises a literal followed immediately (no whitespace) by an identifier as a single token: (for example) the text sequences `"foo"bar`, `1baz` and `1u1024` are now a single token rather than the pairs `"foo"` `bar`, `1` `baz` and `1u` `1024` respectively. The compiler rejects all such suffixes in the parser, except for the 12 numeric suffixes we have now. I'm fairly sure this will affect very few programs, since it's not currently legal to have `<literal><identifier>` in a Rust program, except in a macro invocation. Any macro invocation relying on this behaviour can simply separate the two tokens with whitespace: `foo!("bar"baz)` becomes `foo!("bar" baz)`. This implements [RFC 463](https://github.com/rust-lang/rfcs/blob/master/text/0463-future-proof-literal-suffixes.md), and so closes #19088.
2 parents 00ffcca + a11078f commit f71b852

File tree

15 files changed

+476
-357
lines changed

15 files changed

+476
-357
lines changed

mk/grammar.mk

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,25 @@ endef
3030
$(BG):
3131
$(Q)mkdir -p $(BG)
3232

33-
$(BG)RustLexer.class: $(SG)RustLexer.g4
33+
$(BG)RustLexer.class: $(BG) $(SG)RustLexer.g4
3434
$(Q)$(CFG_ANTLR4) -o $(B)grammar $(SG)RustLexer.g4
3535
$(Q)$(CFG_JAVAC) -d $(BG) $(BG)RustLexer.java
3636

37-
$(BG)verify: $(SG)verify.rs rustc-stage2-H-$(CFG_BUILD) $(LD)stamp.regex_macros $(LD)stamp.rustc
38-
$(Q)$(RUSTC) -O --out-dir $(BG) -L $(L) $(SG)verify.rs
37+
check-build-lexer-verifier: $(BG)verify
38+
39+
ifeq ($(NO_REBUILD),)
40+
VERIFY_DEPS := rustc-stage2-H-$(CFG_BUILD) $(LD)stamp.regex_macros $(LD)stamp.rustc
41+
else
42+
VERIFY_DEPS :=
43+
endif
44+
45+
$(BG)verify: $(BG) $(SG)verify.rs $(VERIFY_DEPS)
46+
$(Q)$(RUSTC) --out-dir $(BG) -L $(L) $(SG)verify.rs
3947

4048
ifdef CFG_JAVAC
4149
ifdef CFG_ANTLR4
4250
ifdef CFG_GRUN
43-
check-lexer: $(BG) $(BG)RustLexer.class $(BG)verify
51+
check-lexer: $(BG) $(BG)RustLexer.class check-build-lexer-verifier
4452
$(info Verifying libsyntax against the reference lexer ...)
4553
$(Q)$(SG)check.sh $(S) "$(BG)" \
4654
"$(CFG_GRUN)" "$(BG)verify" "$(BG)RustLexer.tokens"

mk/tests.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ check-docs: cleantestlibs cleantmptestlogs check-stage2-docs
199199

200200
# Some less critical tests that are not prone to breakage.
201201
# Not run as part of the normal test suite, but tested by bors on checkin.
202-
check-secondary: check-build-compiletest check-lexer check-pretty
202+
check-secondary: check-build-compiletest check-build-lexer-verifier check-lexer check-pretty
203203

204204
# check + check-secondary.
205205
#

src/doc/reference.md

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,15 @@ rather than referring to it by name or some other evaluation rule. A literal is
216216
a form of constant expression, so is evaluated (primarily) at compile time.
217217

218218
```{.ebnf .gram}
219-
literal : string_lit | char_lit | byte_string_lit | byte_lit | num_lit ;
219+
lit_suffix : ident;
220+
literal : [ string_lit | char_lit | byte_string_lit | byte_lit | num_lit ] lit_suffix ?;
220221
```
221222

223+
The optional suffix is only used for certain numeric literals, but is
224+
reserved for future extension; that is, the above gives the lexical
225+
grammar, but a Rust parser will reject everything but the 12 special
226+
cases mentioned in [Number literals](#number-literals) below.
227+
222228
#### Character and string literals
223229

224230
```{.ebnf .gram}
@@ -371,27 +377,20 @@ b"\\x52"; br"\x52"; // \x52
371377
#### Number literals
372378

373379
```{.ebnf .gram}
374-
num_lit : nonzero_dec [ dec_digit | '_' ] * num_suffix ?
375-
| '0' [ [ dec_digit | '_' ] * num_suffix ?
376-
| 'b' [ '1' | '0' | '_' ] + int_suffix ?
377-
| 'o' [ oct_digit | '_' ] + int_suffix ?
378-
| 'x' [ hex_digit | '_' ] + int_suffix ? ] ;
379-
380-
num_suffix : int_suffix | float_suffix ;
380+
num_lit : nonzero_dec [ dec_digit | '_' ] * float_suffix ?
381+
| '0' [ [ dec_digit | '_' ] * float_suffix ?
382+
| 'b' [ '1' | '0' | '_' ] +
383+
| 'o' [ oct_digit | '_' ] +
384+
| 'x' [ hex_digit | '_' ] + ] ;
381385
382-
int_suffix : 'u' int_suffix_size ?
383-
| 'i' int_suffix_size ? ;
384-
int_suffix_size : [ '8' | "16" | "32" | "64" ] ;
386+
float_suffix : [ exponent | '.' dec_lit exponent ? ] ? ;
385387
386-
float_suffix : [ exponent | '.' dec_lit exponent ? ] ? float_suffix_ty ? ;
387-
float_suffix_ty : 'f' [ "32" | "64" ] ;
388388
exponent : ['E' | 'e'] ['-' | '+' ] ? dec_lit ;
389389
dec_lit : [ dec_digit | '_' ] + ;
390390
```
391391

392392
A _number literal_ is either an _integer literal_ or a _floating-point
393-
literal_. The grammar for recognizing the two kinds of literals is mixed, as
394-
they are differentiated by suffixes.
393+
literal_. The grammar for recognizing the two kinds of literals is mixed.
395394

396395
##### Integer literals
397396

@@ -406,9 +405,9 @@ An _integer literal_ has one of four forms:
406405
* A _binary literal_ starts with the character sequence `U+0030` `U+0062`
407406
(`0b`) and continues as any mixture of binary digits and underscores.
408407

409-
An integer literal may be followed (immediately, without any spaces) by an
410-
_integer suffix_, which changes the type of the literal. There are two kinds of
411-
integer literal suffix:
408+
Like any literal, an integer literal may be followed (immediately,
409+
without any spaces) by an _integer suffix_, which forcibly sets the
410+
type of the literal. There are 10 valid values for an integer suffix:
412411

413412
* The `i` and `u` suffixes give the literal type `int` or `uint`,
414413
respectively.
@@ -443,11 +442,9 @@ A _floating-point literal_ has one of two forms:
443442
* A single _decimal literal_ followed by an _exponent_.
444443

445444
By default, a floating-point literal has a generic type, and, like integer
446-
literals, the type must be uniquely determined from the context. A
447-
floating-point literal may be followed (immediately, without any spaces) by a
448-
_floating-point suffix_, which changes the type of the literal. There are two
449-
floating-point suffixes: `f32`, and `f64` (the 32-bit and 64-bit floating point
450-
types).
445+
literals, the type must be uniquely determined from the context. There are two valid
446+
_floating-point suffixes_, `f32` and `f64` (the 32-bit and 64-bit floating point
447+
types), which explicitly determine the type of the literal.
451448

452449
Examples of floating-point literals of various forms:
453450

src/grammar/RustLexer.g4

Lines changed: 15 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -92,49 +92,35 @@ fragment CHAR_ESCAPE
9292
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
9393
;
9494
95-
LIT_CHAR
96-
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\''
95+
fragment SUFFIX
96+
: IDENT
9797
;
9898
99-
LIT_BYTE
100-
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\''
99+
LIT_CHAR
100+
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
101101
;
102102

103-
fragment INT_SUFFIX
104-
: 'i'
105-
| 'i8'
106-
| 'i16'
107-
| 'i32'
108-
| 'i64'
109-
| 'u'
110-
| 'u8'
111-
| 'u16'
112-
| 'u32'
113-
| 'u64'
103+
LIT_BYTE
104+
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
114105
;
115106

116107
LIT_INTEGER
117-
: [0-9][0-9_]* INT_SUFFIX?
118-
| '0b' [01][01_]* INT_SUFFIX?
119-
| '0o' [0-7][0-7_]* INT_SUFFIX?
120-
| '0x' [0-9a-fA-F][0-9a-fA-F_]* INT_SUFFIX?
121-
;
122-
123-
fragment FLOAT_SUFFIX
124-
: 'f32'
125-
| 'f64'
108+
: [0-9][0-9_]* SUFFIX?
109+
| '0b' [01][01_]* SUFFIX?
110+
| '0o' [0-7][0-7_]* SUFFIX?
111+
| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?
126112
;
127113

128114
LIT_FLOAT
129-
: [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? FLOAT_SUFFIX?)
115+
: [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
130116
;
131117

132118
LIT_STR
133-
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"'
119+
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
134120
;
135121

136-
LIT_BINARY : 'b' LIT_STR ;
137-
LIT_BINARY_RAW : 'rb' LIT_STR_RAW ;
122+
LIT_BINARY : 'b' LIT_STR SUFFIX?;
123+
LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
138124

139125
/* this is a bit messy */
140126

@@ -148,7 +134,7 @@ fragment LIT_STR_RAW_INNER2
148134
;
149135

150136
LIT_STR_RAW
151-
: 'r' LIT_STR_RAW_INNER
137+
: 'r' LIT_STR_RAW_INNER SUFFIX?
152138
;
153139

154140
IDENT : XID_start XID_continue* ;

0 commit comments

Comments
 (0)