@@ -31,6 +31,7 @@ use std::char;
31
31
use std:: fmt;
32
32
use std:: slice;
33
33
use std:: str;
34
+ use std:: str:: Utf8Error ;
34
35
35
36
/// A structure used to lex the s-expression syntax of WAT files.
36
37
///
@@ -99,6 +100,12 @@ pub enum TokenKind {
99
100
/// The payload here is the original source text.
100
101
Keyword ,
101
102
103
+ /// An annotation (like `@foo`).
104
+ ///
105
+ /// All annotations start with `@` and the payload will be the name of the
106
+ /// annotation.
107
+ Annotation ,
108
+
102
109
/// A reserved series of `idchar` symbols. Unknown what this is meant to be
103
110
/// used for, you'll probably generate an error about an unexpected token.
104
111
Reserved ,
@@ -136,8 +143,15 @@ pub enum FloatKind {
136
143
}
137
144
138
145
enum ReservedKind {
146
+ /// "..."
139
147
String ,
148
+ /// anything that's just a sequence of `idchars!()`
140
149
Idchars ,
150
+ /// $"..."
151
+ IdString ,
152
+ /// @"..."
153
+ AnnotationString ,
154
+ /// everything else (a conglomeration of strings, idchars, etc)
141
155
Reserved ,
142
156
}
143
157
@@ -199,6 +213,16 @@ pub enum LexError {
199
213
/// version to behave differently than the compiler-visible version, so
200
214
/// these are simply rejected for now.
201
215
ConfusingUnicode ( char ) ,
216
+
217
+ /// An invalid utf-8 sequence was found in a quoted identifier, such as
218
+ /// `$"\ff"`.
219
+ InvalidUtf8Id ( Utf8Error ) ,
220
+
221
+ /// An empty identifier was found, or a lone `$`.
222
+ EmptyId ,
223
+
224
+ /// An empty annotation was found, or a lone `@`.
225
+ EmptyAnnotation ,
202
226
}
203
227
204
228
/// A sign token for an integer.
@@ -420,14 +444,21 @@ impl<'a> Lexer<'a> {
420
444
if let Some ( ret) = self . classify_number ( src) {
421
445
return Ok ( Some ( ret) ) ;
422
446
// https://webassembly.github.io/spec/core/text/values.html#text-id
423
- } else if * c == b'$' && src . len ( ) > 1 {
447
+ } else if * c == b'$' {
424
448
return Ok ( Some ( TokenKind :: Id ) ) ;
449
+ // part of the WebAssembly/annotations proposal
450
+ // (no online url yet)
451
+ } else if * c == b'@' {
452
+ return Ok ( Some ( TokenKind :: Annotation ) ) ;
425
453
// https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
426
454
} else if b'a' <= * c && * c <= b'z' {
427
455
return Ok ( Some ( TokenKind :: Keyword ) ) ;
428
456
}
429
457
}
430
458
459
+ ReservedKind :: IdString => return Ok ( Some ( TokenKind :: Id ) ) ,
460
+ ReservedKind :: AnnotationString => return Ok ( Some ( TokenKind :: Annotation ) ) ,
461
+
431
462
// ... otherwise this was a conglomeration of idchars,
432
463
// strings, or just idchars that don't match a prior rule,
433
464
// meaning this falls through to the fallback `Reserved`
@@ -538,15 +569,15 @@ impl<'a> Lexer<'a> {
538
569
/// eaten. The classification assists in determining what the actual token
539
570
/// here eaten looks like.
540
571
fn parse_reserved ( & self , pos : & mut usize ) -> Result < ( ReservedKind , & ' a str ) , Error > {
541
- let mut idchars = false ;
572
+ let mut idchars = 0u32 ;
542
573
let mut strings = 0u32 ;
543
574
let start = * pos;
544
575
while let Some ( byte) = self . input . as_bytes ( ) . get ( * pos) {
545
576
match byte {
546
577
// Normal `idchars` production which appends to the reserved
547
578
// token that's being produced.
548
579
idchars ! ( ) => {
549
- idchars = true ;
580
+ idchars += 1 ;
550
581
* pos += 1 ;
551
582
}
552
583
@@ -575,9 +606,13 @@ impl<'a> Lexer<'a> {
575
606
}
576
607
let ret = & self . input [ start..* pos] ;
577
608
Ok ( match ( idchars, strings) {
578
- ( false , 0 ) => unreachable ! ( ) ,
579
- ( false , 1 ) => ( ReservedKind :: String , ret) ,
580
- ( true , 0 ) => ( ReservedKind :: Idchars , ret) ,
609
+ ( 0 , 0 ) => unreachable ! ( ) ,
610
+ ( 0 , 1 ) => ( ReservedKind :: String , ret) ,
611
+ ( _, 0 ) => ( ReservedKind :: Idchars , ret) ,
612
+ // Pattern match `@"..."` and `$"..."` for string-based
613
+ // identifiers and annotations.
614
+ ( 1 , 1 ) if ret. starts_with ( "$" ) => ( ReservedKind :: IdString , ret) ,
615
+ ( 1 , 1 ) if ret. starts_with ( "@" ) => ( ReservedKind :: AnnotationString , ret) ,
581
616
_ => ( ReservedKind :: Reserved , ret) ,
582
617
} )
583
618
}
@@ -813,6 +848,37 @@ impl<'a> Lexer<'a> {
813
848
}
814
849
}
815
850
851
+ /// Parses an id-or-string-based name from `it`.
852
+ ///
853
+ /// Note that `it` should already have been lexed and this is just
854
+ /// extracting the value. If the token lexed was `@a` then this should point
855
+ /// to `a`.
856
+ ///
857
+ /// This will automatically detect quoted syntax such as `@"..."` and the
858
+ /// byte string will be parsed and validated as utf-8.
859
+ ///
860
+ /// # Errors
861
+ ///
862
+ /// Returns an error if a quoted byte string is found and contains invalid
863
+ /// utf-8, or if `Lexer::parse_str` fails on the quoted string.
864
+ fn parse_name ( it : & mut str:: Chars < ' a > ) -> Result < Cow < ' a , str > , LexError > {
865
+ if it. clone ( ) . next ( ) == Some ( '"' ) { // peek on a clone so `it` is not advanced yet
866
+ it. next ( ) ; // consume the opening quote seen by the peek above
867
+ match Lexer :: parse_str ( it, true ) ? {
868
+ Cow :: Borrowed ( bytes) => match std:: str:: from_utf8 ( bytes) { // borrowed bytes: validate and keep borrowing
869
+ Ok ( s) => Ok ( Cow :: Borrowed ( s) ) ,
870
+ Err ( e) => Err ( LexError :: InvalidUtf8Id ( e) ) ,
871
+ } ,
872
+ Cow :: Owned ( bytes) => match String :: from_utf8 ( bytes) { // owned bytes: convert the buffer into a `String`
873
+ Ok ( s) => Ok ( Cow :: Owned ( s) ) ,
874
+ Err ( e) => Err ( LexError :: InvalidUtf8Id ( e. utf8_error ( ) ) ) ,
875
+ } ,
876
+ }
877
+ } else {
878
+ Ok ( Cow :: Borrowed ( it. as_str ( ) ) ) // unquoted: the remaining text is returned as-is
879
+ }
880
+ }
881
+
816
882
fn hexnum ( it : & mut str:: Chars < ' _ > ) -> Result < u32 , LexError > {
817
883
let n = Lexer :: hexdigit ( it) ?;
818
884
let mut last_underscore = false ;
@@ -878,28 +944,23 @@ impl<'a> Lexer<'a> {
878
944
std:: iter:: from_fn ( move || self . parse ( & mut pos) . transpose ( ) )
879
945
}
880
946
881
- /// Returns whether an annotation is present at `pos` and the name of the
882
- /// annotation.
883
- pub fn annotation ( & self , mut pos : usize ) -> Option < & ' a str > {
947
+ /// Returns whether an annotation is present at `pos`. If it is present then
948
+ /// `Ok(Some(token))` is returned corresponding to the token, otherwise
949
+ /// `Ok(None)` is returned. If the next token cannot be parsed then an error
950
+ /// is returned.
951
+ pub fn annotation ( & self , mut pos : usize ) -> Result < Option < Token > , Error > {
884
952
let bytes = self . input . as_bytes ( ) ;
885
953
// Quickly reject anything that for sure isn't an annotation since this
886
954
// method is used every time an lparen is parsed.
887
955
if bytes. get ( pos) != Some ( & b'@' ) {
888
- return None ;
956
+ return Ok ( None ) ;
889
957
}
890
- match self . parse ( & mut pos) {
891
- Ok ( Some ( token) ) => {
892
- match token. kind {
893
- TokenKind :: Reserved => { }
894
- _ => return None ,
895
- }
896
- if token. len == 1 {
897
- None // just the `@` character isn't a valid annotation
898
- } else {
899
- Some ( & token. src ( self . input ) [ 1 ..] )
900
- }
901
- }
902
- Ok ( None ) | Err ( _) => None ,
958
+ match self . parse ( & mut pos) ? {
959
+ Some ( token) => match token. kind {
960
+ TokenKind :: Annotation => Ok ( Some ( token) ) ,
961
+ _ => Ok ( None ) ,
962
+ } ,
963
+ None => Ok ( None ) ,
903
964
}
904
965
}
905
966
}
@@ -913,9 +974,49 @@ impl Token {
913
974
/// Returns the identifier, without the leading `$` symbol, that this token
914
975
/// represents.
915
976
///
977
+ /// Note that this method returns the contents of the identifier. With a
978
+ /// string-based identifier this means that escapes have been resolved to
979
+ /// their string-based equivalent.
980
+ ///
916
981
/// Should only be used with `TokenKind::Id`.
917
- pub fn id < ' a > ( & self , s : & ' a str ) -> & ' a str {
918
- & self . src ( s) [ 1 ..]
982
+ ///
983
+ /// # Errors
984
+ ///
985
+ /// Returns an error if this is a string-based identifier (e.g. `$"..."`)
986
+ /// which is invalid utf-8.
987
+ pub fn id < ' a > ( & self , s : & ' a str ) -> Result < Cow < ' a , str > , Error > {
988
+ let mut ch = self . src ( s) . chars ( ) ;
989
+ let dollar = ch. next ( ) ;
990
+ debug_assert_eq ! ( dollar, Some ( '$' ) ) ;
991
+ let id = Lexer :: parse_name ( & mut ch) . map_err ( |e| self . error ( s, e) ) ?;
992
+ if id. is_empty ( ) {
993
+ return Err ( self . error ( s, LexError :: EmptyId ) ) ;
994
+ }
995
+ Ok ( id)
996
+ }
997
+
998
+ /// Returns the annotation, without the leading `@` symbol, that this token
999
+ /// represents.
1000
+ ///
1001
+ /// Note that this method returns the contents of the annotation. With a
1002
+ /// string-based annotation this means that escapes have been resolved to
1003
+ /// their string-based equivalent.
1004
+ ///
1005
+ /// Should only be used with `TokenKind::Annotation`.
1006
+ ///
1007
+ /// # Errors
1008
+ ///
1009
+ /// Returns an error if this is a string-based annotation (e.g. `@"..."`)
1010
+ /// which is invalid utf-8, or if the annotation name is empty.
1011
+ pub fn annotation < ' a > ( & self , s : & ' a str ) -> Result < Cow < ' a , str > , Error > {
1012
+ let mut ch = self . src ( s) . chars ( ) ;
1013
+ let at = ch. next ( ) ;
1014
+ debug_assert_eq ! ( at, Some ( '@' ) ) ;
1015
+ let id = Lexer :: parse_name ( & mut ch) . map_err ( |e| self . error ( s, e) ) ?;
1016
+ if id. is_empty ( ) {
1017
+ return Err ( self . error ( s, LexError :: EmptyAnnotation ) ) ;
1018
+ }
1019
+ Ok ( id)
919
1020
}
920
1021
921
1022
/// Returns the keyword this token represents.
@@ -1061,6 +1162,16 @@ impl Token {
1061
1162
val,
1062
1163
}
1063
1164
}
1165
+
1166
+ fn error ( & self , src : & str , err : LexError ) -> Error { // wraps `err` into an `Error` via `Error::lex`
1167
+ Error :: lex (
1168
+ Span {
1169
+ offset : self . offset , // locate the error at this token's recorded offset
1170
+ } ,
1171
+ src,
1172
+ err,
1173
+ )
1174
+ }
1064
1175
}
1065
1176
1066
1177
impl < ' a > Integer < ' a > {
@@ -1107,6 +1218,9 @@ impl fmt::Display for LexError {
1107
1218
InvalidUnicodeValue ( c) => write ! ( f, "invalid unicode scalar value 0x{:x}" , c) ?,
1108
1219
LoneUnderscore => write ! ( f, "bare underscore in numeric literal" ) ?,
1109
1220
ConfusingUnicode ( c) => write ! ( f, "likely-confusing unicode character found {:?}" , c) ?,
1221
+ InvalidUtf8Id ( _) => write ! ( f, "malformed UTF-8 encoding of string-based id" ) ?,
1222
+ EmptyId => write ! ( f, "empty identifier" ) ?,
1223
+ EmptyAnnotation => write ! ( f, "empty annotation id" ) ?,
1110
1224
}
1111
1225
Ok ( ( ) )
1112
1226
}
@@ -1254,10 +1368,10 @@ mod tests {
1254
1368
1255
1369
#[ test]
1256
1370
fn id ( ) {
1257
- fn get_id ( input : & str ) -> & str {
1371
+ fn get_id ( input : & str ) -> String {
1258
1372
let token = get_token ( input) ;
1259
1373
match token. kind {
1260
- TokenKind :: Id => token. id ( input) ,
1374
+ TokenKind :: Id => token. id ( input) . unwrap ( ) . to_string ( ) ,
1261
1375
other => panic ! ( "not id {:?}" , other) ,
1262
1376
}
1263
1377
}
@@ -1267,6 +1381,23 @@ mod tests {
1267
1381
assert_eq ! ( get_id( "$0^" ) , "0^" ) ;
1268
1382
assert_eq ! ( get_id( "$0^;;" ) , "0^" ) ;
1269
1383
assert_eq ! ( get_id( "$0^ ;;" ) , "0^" ) ;
1384
+ assert_eq ! ( get_id( "$\" x\" ;;" ) , "x" ) ;
1385
+ }
1386
+
1387
+ #[ test]
1388
+ fn annotation ( ) {
1389
+ fn get_annotation ( input : & str ) -> String {
1390
+ let token = get_token ( input) ;
1391
+ match token. kind {
1392
+ TokenKind :: Annotation => token. annotation ( input) . unwrap ( ) . to_string ( ) ,
1393
+ other => panic ! ( "not annotation {:?}" , other) ,
1394
+ }
1395
+ }
1396
+ assert_eq ! ( get_annotation( "@foo" ) , "foo" ) ;
1397
+ assert_eq ! ( get_annotation( "@foo " ) , "foo" ) ;
1398
+ assert_eq ! ( get_annotation( "@f " ) , "f" ) ;
1399
+ assert_eq ! ( get_annotation( "@\" x\" " ) , "x" ) ;
1400
+ assert_eq ! ( get_annotation( "@0 " ) , "0" ) ;
1270
1401
}
1271
1402
1272
1403
#[ test]
@@ -1294,7 +1425,6 @@ mod tests {
1294
1425
other => panic ! ( "not reserved {:?}" , other) ,
1295
1426
}
1296
1427
}
1297
- assert_eq ! ( get_reserved( "$ " ) , "$" ) ;
1298
1428
assert_eq ! ( get_reserved( "^_x " ) , "^_x" ) ;
1299
1429
}
1300
1430
0 commit comments