Finish up matchScalar

rctcwyvrn · rctcwyvrn · commit c2ee8cc430b9 · 2022-06-29T14:59:55.000-07:00
diff --git a/Sources/RegexBenchmark/BenchmarkRegistration.swift b/Sources/RegexBenchmark/BenchmarkRegistration.swift
@@ -18,6 +18,7 @@ extension BenchmarkRunner {
     benchmark.addCustomCharacterClasses()
     benchmark.addDna()
     benchmark.addUnicode()
+    benchmark.addLiteralSearch()
     // -- end of registrations --
     return benchmark
   }
diff --git a/Sources/RegexBenchmark/Suite/LiteralSearch.swift b/Sources/RegexBenchmark/Suite/LiteralSearch.swift
@@ -0,0 +1,10 @@
+import _StringProcessing
+
+extension BenchmarkRunner {
+  /// Registers "LiteralSearchNotFound", then "LiteralSearch": two literal patterns run over `Inputs.dnaFASTA`.
+  mutating func addLiteralSearch() {
+    let absentLiteral = CrossBenchmark(baseName: "LiteralSearchNotFound", regex: "magic_string_to_search_for", input: Inputs.dnaFASTA)
+    let dnaLiteral = CrossBenchmark(baseName: "LiteralSearch", regex: "aatcgaagcagtcttctaacacccttagaaaagcaaacactattgaatactgccgccgca", input: Inputs.dnaFASTA)
+    for benchmark in [absentLiteral, dnaLiteral] { benchmark.register(&self) }
+  }
+}
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -219,22 +219,7 @@ fileprivate extension Compiler.ByteCodeGen {
       return
     }
     
-//    if s.value < 0x300 {
-//      // lily todo: make sure this is correct + add compiler option check after it's merged in
-//
-//      // we unconditionally match against the scalar using consumeScalar in the else case
-//      // so maybe this check is uneccessary??
-//      // I thought having it be < 0x300 made sure we didn't have to worry about any combining stuff
-//      // but in the else case we just unconditionally consume and check the value
-//      // i think this is all redundant
-//      builder.buildMatchScalar(s, boundaryCheck: false)
-//      return
-//    }
-//
-//    builder.buildConsume(by: consumeScalar {
-//      $0 == s
-//    })
-    if optimizationsEnabled {
+    if optimizationsEnabled { // lily note: should we just do this unconditionally?
       builder.buildMatchScalar(s, boundaryCheck: false)
     } else {
       builder.buildConsume(by: consumeScalar {
@@ -263,21 +248,11 @@ fileprivate extension Compiler.ByteCodeGen {
       }
     }
     
-//    if c.unicodeScalars.count == 1,
-//        let first = c.unicodeScalars.first,
-//        first.value < 0x300 { // lily todo: check this more carefully
-      // if we have a single scalar then this must not be an extended grapheme cluster
-      // so it must be a character that can be exactly matched by its first scalar
-      // cr-lf has two scalars right? yes it has two
-      
-      // i think one these two checks are redundant, I think we only need the second?
-      // ask alex?
-    
-    // we can only match against characters that have a single cannonical equivalence
-    // so I think that rules out any latin in here, so just use ascii for now
-    // we also need to exclude our good non-single-scalar-ascii friend cr-lf
-    if optimizationsEnabled && c.isASCII && c != "\r\n" {
-      builder.buildMatchScalar(c.unicodeScalars.first!, boundaryCheck: true)
+    if optimizationsEnabled && c.isASCII {
+      for scalar in c.unicodeScalars {
+        let boundaryCheck = scalar == c.unicodeScalars.last!
+        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
+      }
       return
     }
       
@@ -786,29 +761,14 @@ fileprivate extension Compiler.ByteCodeGen {
             return currentIndex
           }
         } else {
-          // if we have any extended latin in our characters then we have to
-          // respect cannoical equivalence, so we cannot match against scalars exactly
-          // so match against all single scalar ascii
-          
-          // lily todo: which strings are nfc invariant and matchable by direct scalar comparison?
-          // alternatively: loop over characters in s and emit either matchScalar or matchCharacter depending on if it is NFC invariant
-          // getting rid of matchSeq entirely does also get rid of the weird ARC
-          if optimizationsEnabled {
-            for c in s {
-              // Each character needs to be NFC invariant in order for us to match it directly by scalar value in grapheme cluster mode
-              // lily temp: use isASCII for now, ask alex what exactly this check should be
-              if c.isASCII && c != "\r\n" {
-                builder.buildMatchScalar(c.unicodeScalars.first!, boundaryCheck: false)
-              } else {
-                // let's think about this carefully
-                // what if our quoted literal is an ascii character + combining accent
-                // what are the characters in the loop?
-                
-                // I believe that if we ever have ascii + combining character in our input
-                // string will automatically combine them into a unified character, so itll fall into this case
-                
-                // so I don't think we ever need that boundaryCheck to be enabled, except at the end of this sequence
-                builder.buildMatch(c)
+          if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) {
+            for char in s {
+              // Note: only cr-lf is multiple scalars
+              for scalar in char.unicodeScalars {
+                // Only boundary check if we are the last scalar in the last character
+                // to make sure that there isn't a combining scalar after the quoted literal
+                let boundaryCheck = char == s.last! && scalar == char.unicodeScalars.last!
+                builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
               }
             }
           } else {
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -35,6 +35,7 @@ func _firstMatch(
   if validateOptimizations {
     regex._setCompilerOptionsForTesting(.disableOptimizations)
     guard let unoptResult = try regex.firstMatch(in: input) else {
+      XCTFail("Optimized regex for \(regexStr) matched on \(input) when unoptimized regex did not")
       throw MatchError("match not found for unoptimized \(regexStr) in \(input)")
     }
     XCTAssertEqual(
@@ -161,9 +162,10 @@ func firstMatchTest(
   } catch {
     // FIXME: This allows non-matches to succeed even when xfail'd
     // When xfail == true, this should report failure for match == nil
-    if !xfail && match != nil {
-      XCTFail("\(error)", file: file, line: line)
+    if xfail || (match == nil && error is MatchError) {
+      return
     }
+    XCTFail("\(error)", file: file, line: line)
     return
   }
 }
@@ -596,6 +598,12 @@ extension RegexTests {
               ("A", true),
               ("a", false))
 
+    matchTest(#"(?i)[a]"#,
+              ("💿", false),
+              ("a\u{301}", false),
+              ("A", true),
+              ("a", true))
+
     matchTest("[a]",
       ("a\u{301}", false))
 
@@ -1824,6 +1832,15 @@ extension RegexTests {
   
   // TODO: Add test for grapheme boundaries at start/end of match
 
+  /// Checks that the boundary check after `matchScalar` rejects a literal
+  /// that is immediately followed by a combining scalar in the input.
+  func testScalarOptimization() throws {
+    // U+0301 (combining acute accent) joins the preceding "a" into a single
+    // grapheme cluster, so neither input ends in a plain "a": expect no match.
+    firstMatchTest("a", input: "a\u{301}", match: nil)
+    firstMatchTest("aa", input: "aa\u{301}", match: nil)
+  }
+  
   func testCase() {
     let regex = try! Regex(#".\N{SPARKLING HEART}."#)
     let input = "🧟‍♀️💖🧠 or 🧠💖☕️"

Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,7 @@ extension BenchmarkRunner {`
`18`	`18`	`benchmark.addCustomCharacterClasses()`
`19`	`19`	`benchmark.addDna()`
`20`	`20`	`benchmark.addUnicode()`
	`21`	`+ benchmark.addLiteralSearch()`
`21`	`22`	`// -- end of registrations --`
`22`	`23`	`return benchmark`
`23`	`24`	`}`