Skip to content

Commit 353c77a

Browse files
committed
Correct \x lexing
As per PCRE, we eat up to 2 hex digits, with a fun behavior where the absence of any digits is interpreted as 0.
1 parent 35f9171 commit 353c77a

File tree

3 files changed

+25
-10
lines changed

3 files changed

+25
-10
lines changed

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ extension Source {
231231
/// UniScalar -> 'u{' HexDigit{1...} '}'
232232
/// | 'u' HexDigit{4}
233233
/// | 'x{' HexDigit{1...} '}'
234-
/// | 'x' HexDigit{2}
234+
/// | 'x' HexDigit{0...2}
235235
/// | 'U' HexDigit{8}
236236
/// | 'o{' OctalDigit{1...} '}'
237237
/// | OctalDigit{1...3}
@@ -240,15 +240,26 @@ extension Source {
240240
escapedCharacter base: Character
241241
) throws -> Located<Unicode.Scalar> {
242242
try recordLoc { src in
243+
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
243244
switch base {
244245
// Hex numbers.
245-
case "u", "x":
246-
if src.tryEat("{") {
247-
let str = try src.lexUntil(eating: "}").value
248-
return try Source.validateUnicodeScalar(str, .hex)
246+
case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
247+
let str = try src.lexUntil(eating: "}").value
248+
return try Source.validateUnicodeScalar(str, .hex)
249+
250+
case "x":
251+
// \x expects *up to* 2 digits.
252+
guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else {
253+
// In PCRE, \x without any valid hex digits is \u{0}.
254+
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
255+
// could be changed to throw an error if we had a parsing mode for
256+
// them.
257+
return Unicode.Scalar(0)
249258
}
250-
let numDigits = base == "u" ? 4 : 2
251-
return try src.expectUnicodeScalar(numDigits: numDigits).value
259+
return try Source.validateUnicodeScalar(digits.string, .hex)
260+
261+
case "u":
262+
return try src.expectUnicodeScalar(numDigits: 4).value
252263
case "U":
253264
return try src.expectUnicodeScalar(numDigits: 8).value
254265

Tests/RegexTests/LexTests.swift

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,6 @@ extension RegexTests {
8383
_ = scalars
8484
}
8585

86-
// FIXME:
87-
// diagnoseUniScalar(
88-
// "12ab", base: "x", expectedDigits: 2)
8986
diagnoseUniScalar(
9087
"12", base: "u", expectedDigits: 4)
9188
diagnoseUniScalar(

Tests/RegexTests/ParseTests.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,13 @@ extension RegexTests {
270270
parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
271271
parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
272272

273+
// We take *up to* the first two valid digits for \x. No valid digits is 0.
274+
parseTest(#"\x"#, scalar("\u{0}"))
275+
parseTest(#"\x5"#, scalar("\u{5}"))
276+
parseTest(#"\xX"#, concat(scalar("\u{0}"), "X"))
277+
parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X"))
278+
parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b"))
279+
273280
// MARK: Character classes
274281

275282
parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit)))

0 commit comments

Comments
 (0)