
Commit b3c5db0

Update the spec test suite submodule (#1576)
* Update the spec test suite submodule

  This commit updates the spec test suite submodule and notably implements the
  extended syntax in the text format for quoted identifiers. Names such as
  `$"foo"` are now valid and equivalent to `$foo`. This doesn't yet update any
  `wasmprinter`-based printing to use this new syntax; it's just parsed so the
  newly added spec tests pass.

  This additionally updates the lexer to have a first-class notion of an
  annotation instead of lumping annotations in with "reserved" tokens, to
  avoid having to deal with things like `@"foo""bar"`, which is a single
  reserved token but technically not an annotation. Annotations are now
  modelled in the lexer as `TokenKind::Annotation`, distinct from
  `TokenKind::Reserved`, and can get a better first-class error in the future.

* Fix warning on nightly

* Clarify difference in errors
1 parent 01bec9c commit b3c5db0
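
As a quick illustration of the quoted-identifier syntax described above, here is a minimal sketch that parses a module using both forms through the `wast` crate's public `ParseBuffer`/`parser::parse` entry points (the WAT snippet itself is illustrative, not taken from the spec test suite):

```rust
use wast::parser::{self, ParseBuffer};
use wast::Wat;

fn main() -> Result<(), wast::Error> {
    // `$"foo"` is the new quoted form and is equivalent to the plain `$foo`;
    // quoted identifiers may also contain spaces and string escapes.
    let wat = r#"
        (module $"foo"
            (func $"bar baz")
            (func $plain))
    "#;
    let buf = ParseBuffer::new(wat)?;
    let _wat: Wat = parser::parse(&buf)?;
    Ok(())
}
```
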

22 files changed: +571 -163 lines changed


crates/wast/src/lexer.rs

Lines changed: 158 additions & 28 deletions
@@ -31,6 +31,7 @@ use std::char;
 use std::fmt;
 use std::slice;
 use std::str;
+use std::str::Utf8Error;
 
 /// A structure used to lex the s-expression syntax of WAT files.
 ///
@@ -99,6 +100,12 @@ pub enum TokenKind {
     /// The payload here is the original source text.
     Keyword,
 
+    /// An annotation (like `@foo`).
+    ///
+    /// All annotations start with `@` and the payload will be the name of the
+    /// annotation.
+    Annotation,
+
     /// A reserved series of `idchar` symbols. Unknown what this is meant to be
     /// used for, you'll probably generate an error about an unexpected token.
     Reserved,
@@ -136,8 +143,15 @@ pub enum FloatKind {
 }
 
 enum ReservedKind {
+    /// "..."
     String,
+    /// anything that's just a sequence of `idchars!()`
     Idchars,
+    /// $"..."
+    IdString,
+    /// @"..."
+    AnnotationString,
+    /// everything else (a conglomeration of strings, idchars, etc)
     Reserved,
 }

@@ -199,6 +213,16 @@ pub enum LexError {
     /// version to behave differently than the compiler-visible version, so
     /// these are simply rejected for now.
     ConfusingUnicode(char),
+
+    /// An invalid utf-8 sequence was found in a quoted identifier, such as
+    /// `$"\ff"`.
+    InvalidUtf8Id(Utf8Error),
+
+    /// An empty identifier was found, or a lone `$`.
+    EmptyId,
+
+    /// An empty identifier was found, or a lone `@`.
+    EmptyAnnotation,
 }
 
 /// A sign token for an integer.
@@ -420,14 +444,21 @@ impl<'a> Lexer<'a> {
                 if let Some(ret) = self.classify_number(src) {
                     return Ok(Some(ret));
                 // https://webassembly.github.io/spec/core/text/values.html#text-id
-                } else if *c == b'$' && src.len() > 1 {
+                } else if *c == b'$' {
                     return Ok(Some(TokenKind::Id));
+                // part of the WebAssembly/annotations proposal
+                // (no online url yet)
+                } else if *c == b'@' {
+                    return Ok(Some(TokenKind::Annotation));
                 // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
                 } else if b'a' <= *c && *c <= b'z' {
                     return Ok(Some(TokenKind::Keyword));
                 }
             }
 
+            ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
+            ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),
+
             // ... otherwise this was a conglomeration of idchars,
             // strings, or just idchars that don't match a prior rule,
             // meaning this falls through to the fallback `Reserved`
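
A short sketch of the resulting classification. The `Lexer::new` constructor and an `iter(offset)` iterator over tokens are assumed entry points based on the surrounding code (they are not shown in this diff), and `Token`'s `kind` field is accessed the same way the crate-internal tests below do:

```rust
use wast::lexer::{Lexer, TokenKind};

fn main() -> Result<(), wast::Error> {
    // `$...` forms now lex as identifiers, `@...` forms as annotations.
    let lexer = Lexer::new("$\"foo\"");
    assert!(matches!(lexer.iter(0).next().unwrap()?.kind, TokenKind::Id));

    let lexer = Lexer::new("@foo");
    assert!(matches!(
        lexer.iter(0).next().unwrap()?.kind,
        TokenKind::Annotation
    ));
    Ok(())
}
```
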
@@ -538,15 +569,15 @@ impl<'a> Lexer<'a> {
     /// eaten. The classification assists in determining what the actual token
     /// here eaten looks like.
     fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
-        let mut idchars = false;
+        let mut idchars = 0u32;
         let mut strings = 0u32;
         let start = *pos;
         while let Some(byte) = self.input.as_bytes().get(*pos) {
             match byte {
                 // Normal `idchars` production which appends to the reserved
                 // token that's being produced.
                 idchars!() => {
-                    idchars = true;
+                    idchars += 1;
                     *pos += 1;
                 }

@@ -575,9 +606,13 @@ impl<'a> Lexer<'a> {
         }
         let ret = &self.input[start..*pos];
         Ok(match (idchars, strings) {
-            (false, 0) => unreachable!(),
-            (false, 1) => (ReservedKind::String, ret),
-            (true, 0) => (ReservedKind::Idchars, ret),
+            (0, 0) => unreachable!(),
+            (0, 1) => (ReservedKind::String, ret),
+            (_, 0) => (ReservedKind::Idchars, ret),
+            // Pattern match `@"..."` and `$"..."` for string-based
+            // identifiers and annotations.
+            (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
+            (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
             _ => (ReservedKind::Reserved, ret),
         })
     }
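
To make the `(idchars, strings)` classification concrete: `@"foo"` is one idchar (the `@`) followed by one string, so it hits the new `(1, 1)` arm and becomes an annotation, while the `@"foo""bar"` case called out in the commit message has two strings and falls through to `Reserved`. A sketch under the same assumed `Lexer::new`/`iter` entry points as above:

```rust
use wast::lexer::{Lexer, TokenKind};

fn main() -> Result<(), wast::Error> {
    // One idchar (`@`) + one string => AnnotationString => TokenKind::Annotation.
    let lexer = Lexer::new("@\"foo\"");
    assert!(matches!(
        lexer.iter(0).next().unwrap()?.kind,
        TokenKind::Annotation
    ));

    // One idchar + two strings => falls through to TokenKind::Reserved.
    let lexer = Lexer::new("@\"foo\"\"bar\"");
    assert!(matches!(
        lexer.iter(0).next().unwrap()?.kind,
        TokenKind::Reserved
    ));
    Ok(())
}
```
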
@@ -813,6 +848,37 @@ impl<'a> Lexer<'a> {
         }
     }
 
+    /// Parses an id-or-string-based name from `it`.
+    ///
+    /// Note that `it` should already have been lexed and this is just
+    /// extracting the value. If the token lexed was `@a` then this should point
+    /// to `a`.
+    ///
+    /// This will automatically detect quoted syntax such as `@"..."` and the
+    /// byte string will be parsed and validated as utf-8.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a quoted byte string is found and contains invalid
+    /// utf-8.
+    fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
+        if it.clone().next() == Some('"') {
+            it.next();
+            match Lexer::parse_str(it, true)? {
+                Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
+                    Ok(s) => Ok(Cow::Borrowed(s)),
+                    Err(e) => Err(LexError::InvalidUtf8Id(e)),
+                },
+                Cow::Owned(bytes) => match String::from_utf8(bytes) {
+                    Ok(s) => Ok(Cow::Owned(s)),
+                    Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
+                },
+            }
+        } else {
+            Ok(Cow::Borrowed(it.as_str()))
+        }
+    }
+
     fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
         let n = Lexer::hexdigit(it)?;
         let mut last_underscore = false;
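
Because the quoted form is lexed as a byte string and only validated as utf-8 when the name is extracted, a token like `$"\ff"` lexes successfully but fails when its contents are requested, producing the `InvalidUtf8Id` error added above. A sketch, again assuming the `Lexer::new`/`iter` entry points:

```rust
use wast::lexer::Lexer;

fn main() -> Result<(), wast::Error> {
    // `\ff` is a raw byte escape, so the string contents are not valid utf-8.
    let src = "$\"\\ff\"";
    let lexer = Lexer::new(src);
    let token = lexer.iter(0).next().unwrap()?;
    // Extracting the name runs the utf-8 validation in `parse_name` and errors.
    assert!(token.id(src).is_err());
    Ok(())
}
```
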
@@ -878,28 +944,23 @@ impl<'a> Lexer<'a> {
         std::iter::from_fn(move || self.parse(&mut pos).transpose())
     }
 
-    /// Returns whether an annotation is present at `pos` and the name of the
-    /// annotation.
-    pub fn annotation(&self, mut pos: usize) -> Option<&'a str> {
+    /// Returns whether an annotation is present at `pos`. If it is present then
+    /// `Ok(Some(token))` is returned corresponding to the token, otherwise
+    /// `Ok(None)` is returned. If the next token cannot be parsed then an error
+    /// is returned.
+    pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
         let bytes = self.input.as_bytes();
         // Quickly reject anything that for sure isn't an annotation since this
         // method is used every time an lparen is parsed.
         if bytes.get(pos) != Some(&b'@') {
-            return None;
+            return Ok(None);
        }
-        match self.parse(&mut pos) {
-            Ok(Some(token)) => {
-                match token.kind {
-                    TokenKind::Reserved => {}
-                    _ => return None,
-                }
-                if token.len == 1 {
-                    None // just the `@` character isn't a valid annotation
-                } else {
-                    Some(&token.src(self.input)[1..])
-                }
-            }
-            Ok(None) | Err(_) => None,
+        match self.parse(&mut pos)? {
+            Some(token) => match token.kind {
+                TokenKind::Annotation => Ok(Some(token)),
+                _ => Ok(None),
+            },
+            None => Ok(None),
         }
     }
 }
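
The updated `annotation` helper now returns the whole token rather than a bare string slice, leaving name extraction (and its utf-8 error reporting) to `Token::annotation`. A sketch of how a caller might use it, assuming `Lexer::new` as the constructor:

```rust
use wast::lexer::Lexer;

fn main() -> Result<(), wast::Error> {
    let src = "(@custom \"name\" \"payload\")";
    let lexer = Lexer::new(src);
    // Offset 1 is the byte just after `(`, where `@custom` starts.
    let token = lexer.annotation(1)?.expect("expected an annotation token");
    assert_eq!(token.annotation(src)?, "custom");
    Ok(())
}
```
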
@@ -913,9 +974,49 @@ impl Token {
     /// Returns the identifier, without the leading `$` symbol, that this token
     /// represents.
     ///
+    /// Note that this method returns the contents of the identifier. With a
+    /// string-based identifier this means that escapes have been resolved to
+    /// their string-based equivalent.
+    ///
     /// Should only be used with `TokenKind::Id`.
-    pub fn id<'a>(&self, s: &'a str) -> &'a str {
-        &self.src(s)[1..]
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
+    /// which is invalid utf-8.
+    pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
+        let mut ch = self.src(s).chars();
+        let dollar = ch.next();
+        debug_assert_eq!(dollar, Some('$'));
+        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
+        if id.is_empty() {
+            return Err(self.error(s, LexError::EmptyId));
+        }
+        Ok(id)
+    }
+
+    /// Returns the annotation, without the leading `@` symbol, that this token
+    /// represents.
+    ///
+    /// Note that this method returns the contents of the identifier. With a
+    /// string-based identifier this means that escapes have been resolved to
+    /// their string-based equivalent.
+    ///
+    /// Should only be used with `TokenKind::Annotation`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
+    /// which is invalid utf-8.
+    pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
+        let mut ch = self.src(s).chars();
+        let at = ch.next();
+        debug_assert_eq!(at, Some('@'));
+        let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
+        if id.is_empty() {
+            return Err(self.error(s, LexError::EmptyAnnotation));
+        }
+        Ok(id)
     }
 
     /// Returns the keyword this token represents.
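
With string-based identifiers, `Token::id` resolves escapes, so the returned `Cow` may be owned rather than a plain slice of the source. A sketch using the same assumed `Lexer::new`/`iter` entry points as earlier:

```rust
use wast::lexer::Lexer;

fn main() -> Result<(), wast::Error> {
    // `\20` is a hex escape for a space, so the extracted contents differ
    // from the raw source text and the returned `Cow` is owned.
    let src = "$\"hello\\20world\"";
    let lexer = Lexer::new(src);
    let token = lexer.iter(0).next().unwrap()?;
    assert_eq!(token.id(src)?, "hello world");
    Ok(())
}
```
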
@@ -1061,6 +1162,16 @@ impl Token {
             val,
         }
     }
+
+    fn error(&self, src: &str, err: LexError) -> Error {
+        Error::lex(
+            Span {
+                offset: self.offset,
+            },
+            src,
+            err,
+        )
+    }
 }
 
 impl<'a> Integer<'a> {
@@ -1107,6 +1218,9 @@ impl fmt::Display for LexError {
             InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
             LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
             ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
+            InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
+            EmptyId => write!(f, "empty identifier")?,
+            EmptyAnnotation => write!(f, "empty annotation id")?,
         }
         Ok(())
     }
@@ -1254,10 +1368,10 @@ mod tests {
 
     #[test]
     fn id() {
-        fn get_id(input: &str) -> &str {
+        fn get_id(input: &str) -> String {
             let token = get_token(input);
             match token.kind {
-                TokenKind::Id => token.id(input),
+                TokenKind::Id => token.id(input).unwrap().to_string(),
                 other => panic!("not id {:?}", other),
             }
         }
@@ -1267,6 +1381,23 @@ mod tests {
         assert_eq!(get_id("$0^"), "0^");
         assert_eq!(get_id("$0^;;"), "0^");
         assert_eq!(get_id("$0^ ;;"), "0^");
+        assert_eq!(get_id("$\"x\" ;;"), "x");
+    }
+
+    #[test]
+    fn annotation() {
+        fn get_annotation(input: &str) -> String {
+            let token = get_token(input);
+            match token.kind {
+                TokenKind::Annotation => token.annotation(input).unwrap().to_string(),
+                other => panic!("not annotation {:?}", other),
+            }
+        }
+        assert_eq!(get_annotation("@foo"), "foo");
+        assert_eq!(get_annotation("@foo "), "foo");
+        assert_eq!(get_annotation("@f "), "f");
+        assert_eq!(get_annotation("@\"x\" "), "x");
+        assert_eq!(get_annotation("@0 "), "0");
     }
 
     #[test]
@@ -1294,7 +1425,6 @@
                 other => panic!("not reserved {:?}", other),
             }
         }
-        assert_eq!(get_reserved("$ "), "$");
         assert_eq!(get_reserved("^_x "), "^_x");
     }

crates/wast/src/lib.rs

Lines changed: 4 additions & 4 deletions
@@ -317,8 +317,8 @@ macro_rules! annotation {
         impl<'a> $crate::parser::Parse<'a> for $name {
             fn parse(parser: $crate::parser::Parser<'a>) -> $crate::parser::Result<Self> {
                 parser.step(|c| {
-                    if let Some((a, rest)) = c.reserved()? {
-                        if a == concat!("@", $annotation) {
+                    if let Some((a, rest)) = c.annotation()? {
+                        if a == $annotation {
                             return Ok(($name(c.cur_span()), rest));
                         }
                     }
@@ -329,8 +329,8 @@ macro_rules! annotation {
 
         impl $crate::parser::Peek for $name {
             fn peek(cursor: $crate::parser::Cursor<'_>) -> $crate::parser::Result<bool> {
-                Ok(if let Some((a, _rest)) = cursor.reserved()? {
-                    a == concat!("@", $annotation)
+                Ok(if let Some((a, _rest)) = cursor.annotation()? {
+                    a == $annotation
                 } else {
                     false
                 })

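For context, a sketch of roughly what the `annotation!` macro now expands to for a hypothetical `@my.note` annotation. The `MyNote` type, the `Peek::display` body, and the error construction via `Parser::error` are illustrative assumptions layered on top of the macro body shown above:

```rust
use wast::parser::{Cursor, Parse, Parser, Peek, Result};
use wast::token::Span;

/// Hypothetical marker type for an `(@my.note ...)` annotation.
pub struct MyNote(pub Span);

impl<'a> Parse<'a> for MyNote {
    fn parse(parser: Parser<'a>) -> Result<Self> {
        parser.step(|c| {
            // `Cursor::annotation` already strips the leading `@`, so the
            // comparison no longer needs `concat!("@", ...)`.
            if let Some((a, rest)) = c.annotation()? {
                if a == "my.note" {
                    return Ok((MyNote(c.cur_span()), rest));
                }
            }
            Err(parser.error("expected `@my.note` annotation"))
        })
    }
}

impl Peek for MyNote {
    fn peek(cursor: Cursor<'_>) -> Result<bool> {
        Ok(if let Some((a, _rest)) = cursor.annotation()? {
            a == "my.note"
        } else {
            false
        })
    }

    fn display() -> &'static str {
        "`@my.note` annotation"
    }
}
```
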