Skip to content

Commit c2ee8cc

Browse files
committed
Finish up matchScalar
1 parent df8919e commit c2ee8cc

File tree

4 files changed

+44
-56
lines changed

4 files changed

+44
-56
lines changed

Sources/RegexBenchmark/BenchmarkRegistration.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ extension BenchmarkRunner {
1818
benchmark.addCustomCharacterClasses()
1919
benchmark.addDna()
2020
benchmark.addUnicode()
21+
benchmark.addLiteralSearch()
2122
// -- end of registrations --
2223
return benchmark
2324
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import _StringProcessing
2+
3+
extension BenchmarkRunner {
4+
mutating func addLiteralSearch() {
5+
let searchNotFound = CrossBenchmark(baseName: "LiteralSearchNotFound", regex: "magic_string_to_search_for", input: Inputs.dnaFASTA)
6+
let search = CrossBenchmark(baseName: "LiteralSearch", regex: "aatcgaagcagtcttctaacacccttagaaaagcaaacactattgaatactgccgccgca", input: Inputs.dnaFASTA)
7+
searchNotFound.register(&self)
8+
search.register(&self)
9+
}
10+
}

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -219,22 +219,7 @@ fileprivate extension Compiler.ByteCodeGen {
219219
return
220220
}
221221

222-
// if s.value < 0x300 {
223-
// // lily todo: make sure this is correct + add compiler option check after it's merged in
224-
//
225-
// // we unconditionally match against the scalar using consumeScalar in the else case
226-
// // so maybe this check is uneccessary??
227-
// // I thought having it be < 0x300 made sure we didn't have to worry about any combining stuff
228-
// // but in the else case we just unconditionally consume and check the value
229-
// // i think this is all redundant
230-
// builder.buildMatchScalar(s, boundaryCheck: false)
231-
// return
232-
// }
233-
//
234-
// builder.buildConsume(by: consumeScalar {
235-
// $0 == s
236-
// })
237-
if optimizationsEnabled {
222+
if optimizationsEnabled { // lily note: should we just do this unconditionally?
238223
builder.buildMatchScalar(s, boundaryCheck: false)
239224
} else {
240225
builder.buildConsume(by: consumeScalar {
@@ -263,21 +248,11 @@ fileprivate extension Compiler.ByteCodeGen {
263248
}
264249
}
265250

266-
// if c.unicodeScalars.count == 1,
267-
// let first = c.unicodeScalars.first,
268-
// first.value < 0x300 { // lily todo: check this more carefully
269-
// if we have a single scalar then this must not be an extended grapheme cluster
270-
// so it must be a character that can be exactly matched by its first scalar
271-
// cr-lf has two scalars right? yes it has two
272-
273-
// i think one these two checks are redundant, I think we only need the second?
274-
// ask alex?
275-
276-
// we can only match against characters that have a single cannonical equivalence
277-
// so I think that rules out any latin in here, so just use ascii for now
278-
// we also need to exclude our good non-single-scalar-ascii friend cr-lf
279-
if optimizationsEnabled && c.isASCII && c != "\r\n" {
280-
builder.buildMatchScalar(c.unicodeScalars.first!, boundaryCheck: true)
251+
if optimizationsEnabled && c.isASCII {
252+
for scalar in c.unicodeScalars {
253+
let boundaryCheck = scalar == c.unicodeScalars.last!
254+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
255+
}
281256
return
282257
}
283258

@@ -786,29 +761,14 @@ fileprivate extension Compiler.ByteCodeGen {
786761
return currentIndex
787762
}
788763
} else {
789-
// if we have any extended latin in our characters then we have to
790-
// respect cannoical equivalence, so we cannot match against scalars exactly
791-
// so match against all single scalar ascii
792-
793-
// lily todo: which strings are nfc invariant and matchable by direct scalar comparison?
794-
// alternatively: loop over characters in s and emit either matchScalar or matchCharacter depending on if it is NFC invariant
795-
// getting rid of matchSeq entirely does also get rid of the weird ARC
796-
if optimizationsEnabled {
797-
for c in s {
798-
// Each character needs to be NFC invariant in order for us to match it directly by scalar value in grapheme cluster mode
799-
// lily temp: use isASCII for now, ask alex what exactly this check should be
800-
if c.isASCII && c != "\r\n" {
801-
builder.buildMatchScalar(c.unicodeScalars.first!, boundaryCheck: false)
802-
} else {
803-
// let's think about this carefully
804-
// what if our quoted literal is an ascii character + combining accent
805-
// what are the characters in the loop?
806-
807-
// I believe that if we ever have ascii + combining character in our input
808-
// string will automatically combine them into a unified character, so itll fall into this case
809-
810-
// so I don't think we ever need that boundaryCheck to be enabled, except at the end of this sequence
811-
builder.buildMatch(c)
764+
if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) {
765+
for char in s {
766+
// Note: only cr-lf is multiple scalars
767+
for scalar in char.unicodeScalars {
768+
// Only boundary check if we are the last scalar in the last character
769+
// to make sure that there isn't a combining scalar after the quoted literal
770+
let boundaryCheck = char == s.last! && scalar == char.unicodeScalars.last!
771+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
812772
}
813773
}
814774
} else {

Tests/RegexTests/MatchTests.swift

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ func _firstMatch(
3535
if validateOptimizations {
3636
regex._setCompilerOptionsForTesting(.disableOptimizations)
3737
guard let unoptResult = try regex.firstMatch(in: input) else {
38+
XCTFail("Optimized regex for \(regexStr) matched on \(input) when unoptimized regex did not")
3839
throw MatchError("match not found for unoptimized \(regexStr) in \(input)")
3940
}
4041
XCTAssertEqual(
@@ -161,9 +162,10 @@ func firstMatchTest(
161162
} catch {
162163
// FIXME: This allows non-matches to succeed even when xfail'd
163164
// When xfail == true, this should report failure for match == nil
164-
if !xfail && match != nil {
165-
XCTFail("\(error)", file: file, line: line)
165+
if xfail || (match == nil && error is MatchError) {
166+
return
166167
}
168+
XCTFail("\(error)", file: file, line: line)
167169
return
168170
}
169171
}
@@ -596,6 +598,12 @@ extension RegexTests {
596598
("A", true),
597599
("a", false))
598600

601+
matchTest(#"(?i)[a]"#,
602+
("💿", false),
603+
("a\u{301}", false),
604+
("A", true),
605+
("a", true))
606+
599607
matchTest("[a]",
600608
("a\u{301}", false))
601609

@@ -1824,6 +1832,15 @@ extension RegexTests {
18241832

18251833
// TODO: Add test for grapheme boundaries at start/end of match
18261834

1835+
func testScalarOptimization() throws {
1836+
// check that we are correctly doing the boundary check after matchScalar
1837+
firstMatchTest("a", input: "a\u{301}", match: nil)
1838+
firstMatchTest("aa", input: "aa\u{301}", match: nil)
1839+
// let regex = "aa"
1840+
// let input = "aa\u{301}"
1841+
// XCTAssertEqual(regex.firstMatch(of: input), nil)
1842+
}
1843+
18271844
func testCase() {
18281845
let regex = try! Regex(#".\N{SPARKLING HEART}."#)
18291846
let input = "🧟‍♀️💖🧠 or 🧠💖☕️"

0 commit comments

Comments
 (0)