Skip to content

Commit e6ec173

Browse files
authored
Merge pull request #101 from hamishknight/solve-for-x
2 parents 49418d2 + 353c77a commit e6ec173

File tree

6 files changed

+34
-19
lines changed

6 files changed

+34
-19
lines changed

Sources/_MatchingEngine/Regex/AST/Group.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ extension AST {
2727
case nonCaptureReset
2828

2929
// (?>...)
30-
case atomicNonCapturing // TODO: is Oniguruma capturing?
30+
case atomicNonCapturing
3131

3232
// (?=...)
3333
case lookahead

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ extension Source {
231231
/// UniScalar -> 'u{' HexDigit{1...} '}'
232232
/// | 'u' HexDigit{4}
233233
/// | 'x{' HexDigit{1...} '}'
234-
/// | 'x' HexDigit{2}
234+
/// | 'x' HexDigit{0...2}
235235
/// | 'U' HexDigit{8}
236236
/// | 'o{' OctalDigit{1...} '}'
237237
/// | OctalDigit{1...3}
@@ -240,15 +240,26 @@ extension Source {
240240
escapedCharacter base: Character
241241
) throws -> Located<Unicode.Scalar> {
242242
try recordLoc { src in
243+
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
243244
switch base {
244245
// Hex numbers.
245-
case "u", "x":
246-
if src.tryEat("{") {
247-
let str = try src.lexUntil(eating: "}").value
248-
return try Source.validateUnicodeScalar(str, .hex)
246+
case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
247+
let str = try src.lexUntil(eating: "}").value
248+
return try Source.validateUnicodeScalar(str, .hex)
249+
250+
case "x":
251+
// \x expects *up to* 2 digits.
252+
guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else {
253+
// In PCRE, \x without any valid hex digits is \u{0}.
254+
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
255+
// could be changed to throw an error if we had a parsing mode for
256+
// them.
257+
return Unicode.Scalar(0)
249258
}
250-
let numDigits = base == "u" ? 4 : 2
251-
return try src.expectUnicodeScalar(numDigits: numDigits).value
259+
return try Source.validateUnicodeScalar(digits.string, .hex)
260+
261+
case "u":
262+
return try src.expectUnicodeScalar(numDigits: 4).value
252263
case "U":
253264
return try src.expectUnicodeScalar(numDigits: 8).value
254265

@@ -514,7 +525,7 @@ extension Source {
514525
/// Try to lex a sequence of matching options.
515526
///
516527
/// MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+
517-
/// | MatchingOption* '-' MatchingOption+
528+
/// | MatchingOption* '-' MatchingOption*
518529
///
519530
mutating func lexMatchingOptionSequence(
520531
) throws -> AST.MatchingOptionSequence? {
@@ -527,8 +538,8 @@ extension Source {
527538
adding.append(opt)
528539
}
529540

530-
// If the sequence begun with a caret '^', options can be added, so we're
531-
// done.
541+
// If the sequence began with a caret '^', options can only be added, so
542+
// we're done.
532543
if ateCaret.value {
533544
return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil,
534545
removing: [])

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,6 @@ extension Parser {
252252
mutating func parseCCCMembers(
253253
into members: inout Array<CustomCC.Member>
254254
) throws {
255-
// FIXME: Track source locations
256-
257255
// Parse members until we see the end of the custom char class or an
258256
// operator.
259257
while source.peek() != "]" && source.peekCCBinOp() == nil {

Sources/_MatchingEngine/Utility/MissingUnicode.swift

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,8 @@ extension Unicode {
240240
case spaceSeparator = "Zs"
241241
}
242242

243-
// A list of unicode properties that can either be true or false.
244-
// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
243+
/// A list of Unicode properties that can be either true or false.
244+
/// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
245245
public enum BinaryProperty: String, Hashable {
246246
case asciiHexDigit = "ASCII_Hex_Digit"
247247
case alphabetic = "Alphabetic"
@@ -313,7 +313,9 @@ extension Unicode {
313313
}
314314
}
315315

316-
// Oniguruma properties that are not covered by the above.
316+
/// Oniguruma properties that are not covered by Unicode spellings.
317+
/// TODO: These should become aliases for the Block (blk) Unicode character
318+
/// property.
317319
public enum OnigurumaSpecialProperty: String, Hashable {
318320
case inBasicLatin = "In_Basic_Latin"
319321
case inLatin1Supplement = "In_Latin_1_Supplement"

Tests/RegexTests/LexTests.swift

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,6 @@ extension RegexTests {
8383
_ = scalars
8484
}
8585

86-
// FIXME:
87-
// diagnoseUniScalar(
88-
// "12ab", base: "x", expectedDigits: 2)
8986
diagnoseUniScalar(
9087
"12", base: "u", expectedDigits: 4)
9188
diagnoseUniScalar(

Tests/RegexTests/ParseTests.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,13 @@ extension RegexTests {
270270
parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
271271
parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
272272

273+
// We take *up to* the first two valid digits for \x. No valid digits is 0.
274+
parseTest(#"\x"#, scalar("\u{0}"))
275+
parseTest(#"\x5"#, scalar("\u{5}"))
276+
parseTest(#"\xX"#, concat(scalar("\u{0}"), "X"))
277+
parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X"))
278+
parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b"))
279+
273280
// MARK: Character classes
274281

275282
parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit)))

0 commit comments

Comments
 (0)