Merge pull request #101 from hamishknight/solve-for-x

hamishknight · web-flow · commit e6ec17368e9c · 2022-01-10T15:03:13.000Z
diff --git a/Sources/_MatchingEngine/Regex/AST/Group.swift b/Sources/_MatchingEngine/Regex/AST/Group.swift
@@ -27,7 +27,7 @@ extension AST {
       case nonCaptureReset
 
       // (?>...)
-      case atomicNonCapturing // TODO: is Oniguruma capturing?
+      case atomicNonCapturing
 
       // (?=...)
       case lookahead
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -231,7 +231,7 @@ extension Source {
   ///     UniScalar -> 'u{' HexDigit{1...} '}'
   ///                | 'u'  HexDigit{4}
   ///                | 'x{' HexDigit{1...} '}'
-  ///                | 'x'  HexDigit{2}
+  ///                | 'x'  HexDigit{0...2}
   ///                | 'U'  HexDigit{8}
   ///                | 'o{' OctalDigit{1...} '}'
   ///                | OctalDigit{1...3}
@@ -240,15 +240,26 @@ extension Source {
     escapedCharacter base: Character
   ) throws -> Located<Unicode.Scalar> {
     try recordLoc { src in
+      // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
       switch base {
       // Hex numbers.
-      case "u", "x":
-        if src.tryEat("{") {
-          let str = try src.lexUntil(eating: "}").value
-          return try Source.validateUnicodeScalar(str, .hex)
+      case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
+        let str = try src.lexUntil(eating: "}").value
+        return try Source.validateUnicodeScalar(str, .hex)
+
+      case "x":
+        // \x expects *up to* 2 digits.
+        guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else {
+          // In PCRE, \x without any valid hex digits is \u{0}.
+          // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
+          // could be changed to throw an error if we had a parsing mode for
+          // them.
+          return Unicode.Scalar(0)
         }
-        let numDigits = base == "u" ? 4 : 2
-        return try src.expectUnicodeScalar(numDigits: numDigits).value
+        return try Source.validateUnicodeScalar(digits.string, .hex)
+
+      case "u":
+        return try src.expectUnicodeScalar(numDigits: 4).value
       case "U":
         return try src.expectUnicodeScalar(numDigits: 8).value
 
@@ -514,7 +525,7 @@ extension Source {
   /// Try to lex a sequence of matching options.
   ///
   ///     MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+
-  ///                        | MatchingOption* '-' MatchingOption+
+  ///                        | MatchingOption* '-' MatchingOption*
   ///
   mutating func lexMatchingOptionSequence(
   ) throws -> AST.MatchingOptionSequence? {
@@ -527,8 +538,8 @@ extension Source {
       adding.append(opt)
     }
 
-    // If the sequence begun with a caret '^', options can be added, so we're
-    // done.
+    // If the sequence began with a caret '^', options can only be added, so
+    // we're done.
     if ateCaret.value {
       return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil,
                    removing: [])
diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift
@@ -252,8 +252,6 @@ extension Parser {
   mutating func parseCCCMembers(
     into members: inout Array<CustomCC.Member>
   ) throws {
-    // FIXME: Track source locations
-
     // Parse members until we see the end of the custom char class or an
     // operator.
     while source.peek() != "]" && source.peekCCBinOp() == nil {
diff --git a/Sources/_MatchingEngine/Utility/MissingUnicode.swift b/Sources/_MatchingEngine/Utility/MissingUnicode.swift
@@ -240,8 +240,8 @@ extension Unicode {
     case spaceSeparator = "Zs"
   }
 
-  // A list of unicode properties that can either be true or false.
-  // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
+  /// A list of unicode properties that can either be true or false.
+  /// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
   public enum BinaryProperty: String, Hashable {
     case asciiHexDigit = "ASCII_Hex_Digit"
     case alphabetic = "Alphabetic"
@@ -313,7 +313,9 @@ extension Unicode {
   }
 }
 
-// Oniguruma properties that are not covered by the above.
+/// Oniguruma properties that are not covered by Unicode spellings.
+/// TODO: These should become aliases for the Block (blk) Unicode character
+/// property.
 public enum OnigurumaSpecialProperty: String, Hashable {
   case inBasicLatin = "In_Basic_Latin"
   case inLatin1Supplement = "In_Latin_1_Supplement"
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
@@ -83,9 +83,6 @@ extension RegexTests {
       _ = scalars
     }
 
-// FIXME:
-//    diagnoseUniScalar(
-//      "12ab", base: "x", expectedDigits: 2)
     diagnoseUniScalar(
       "12", base: "u", expectedDigits: 4)
     diagnoseUniScalar(
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -270,6 +270,13 @@ extension RegexTests {
     parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
     parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
 
+    // We take *up to* the first two valid digits for \x. No valid digits is 0.
+    parseTest(#"\x"#, scalar("\u{0}"))
+    parseTest(#"\x5"#, scalar("\u{5}"))
+    parseTest(#"\xX"#, concat(scalar("\u{0}"), "X"))
+    parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X"))
+    parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b"))
+
     // MARK: Character classes
 
     parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit)))

Original file line number	Diff line number	Diff line change
`@@ -240,8 +240,8 @@ extension Unicode {`
`240`	`240`	`case spaceSeparator = "Zs"`
`241`	`241`	`}`
`242`	`242`
`243`		`- // A list of unicode properties that can either be true or false.`
`244`		`- // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt`
	`243`	`+ /// A list of unicode properties that can either be true or false.`
	`244`	`+ /// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt`
`245`	`245`	`public enum BinaryProperty: String, Hashable {`
`246`	`246`	`case asciiHexDigit = "ASCII_Hex_Digit"`
`247`	`247`	`case alphabetic = "Alphabetic"`
`@@ -313,7 +313,9 @@ extension Unicode {`
`313`	`313`	`}`
`314`	`314`	`}`
`315`	`315`
`316`		`-// Oniguruma properties that are not covered by the above.`
	`316`	`+/// Oniguruma properties that are not covered by Unicode spellings.`
	`317`	`+/// TODO: These should become aliases for the Block (blk) Unicode character`
	`318`	`+/// property.`
`317`	`319`	`public enum OnigurumaSpecialProperty: String, Hashable {`
`318`	`320`	`case inBasicLatin = "In_Basic_Latin"`
`319`	`321`	`case inLatin1Supplement = "In_Latin_1_Supplement"`
Original file line number	Diff line number	Diff line change
`@@ -83,9 +83,6 @@ extension RegexTests {`
`83`	`83`	`_ = scalars`
`84`	`84`	`}`
`85`	`85`
`86`		`-// FIXME:`
`87`		`-// diagnoseUniScalar(`
`88`		`-// "12ab", base: "x", expectedDigits: 2)`
`89`	`86`	`diagnoseUniScalar(`
`90`	`87`	`"12", base: "u", expectedDigits: 4)`
`91`	`88`	`diagnoseUniScalar(`