Skip to content

Commit e3d7ad7

Browse files
committed
Emit matchScalar in quotedLiteral when in unicode scalar mode
1 parent 06c77c7 commit e3d7ad7

File tree

4 files changed

+106
-67
lines changed

4 files changed

+106
-67
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 68 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,73 @@ fileprivate extension Compiler.ByteCodeGen {
7474
}
7575
}
7676

77+
mutating func emitQuotedLiteral(_ s: String) {
78+
if options.semanticLevel == .graphemeCluster {
79+
if options.isCaseInsensitive {
80+
// future work: if all ascii, emit matchBitset instructions with
81+
// case insensitive bitsets
82+
83+
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
84+
builder.buildConsume { input, bounds in
85+
var iterator = s.makeIterator()
86+
var currentIndex = bounds.lowerBound
87+
while let ch = iterator.next() {
88+
guard currentIndex < bounds.upperBound,
89+
ch.lowercased() == input[currentIndex].lowercased()
90+
else { return nil }
91+
input.formIndex(after: &currentIndex)
92+
}
93+
return currentIndex
94+
}
95+
} else {
96+
if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) {
97+
for char in s {
98+
// Note: only cr-lf is multiple scalars
99+
for scalar in char.unicodeScalars {
100+
// Only boundary check if we are the last scalar in the last character
101+
// to make sure that there isn't a combining scalar after the quoted literal
102+
let boundaryCheck = char == s.last! && scalar == char.unicodeScalars.last!
103+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
104+
}
105+
}
106+
} else {
107+
builder.buildMatchSequence(s)
108+
}
109+
}
110+
} else {
111+
if optimizationsEnabled && !options.isCaseInsensitive {
112+
// Match all scalars exactly, never boundary check because we're in
113+
// unicode scalars mode
114+
for char in s {
115+
for scalar in char.unicodeScalars {
116+
builder.buildMatchScalar(scalar, boundaryCheck: false)
117+
}
118+
}
119+
} else {
120+
builder.buildConsume {
121+
[caseInsensitive = options.isCaseInsensitive] input, bounds in
122+
// TODO: Case folding
123+
var iterator = s.unicodeScalars.makeIterator()
124+
var currentIndex = bounds.lowerBound
125+
while let scalar = iterator.next() {
126+
guard currentIndex < bounds.upperBound else { return nil }
127+
if caseInsensitive {
128+
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
129+
return nil
130+
}
131+
} else {
132+
if scalar != input.unicodeScalars[currentIndex] {
133+
return nil
134+
}
135+
}
136+
input.unicodeScalars.formIndex(after: &currentIndex)
137+
}
138+
return currentIndex
139+
}
140+
}
141+
}
142+
}
143+
77144
mutating func emitBackreference(
78145
_ ref: AST.Reference
79146
) throws {
@@ -747,57 +814,7 @@ fileprivate extension Compiler.ByteCodeGen {
747814
try emitAtom(a)
748815

749816
case let .quotedLiteral(s):
750-
if options.semanticLevel == .graphemeCluster {
751-
if options.isCaseInsensitive {
752-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
753-
builder.buildConsume { input, bounds in
754-
var iterator = s.makeIterator()
755-
var currentIndex = bounds.lowerBound
756-
while let ch = iterator.next() {
757-
guard currentIndex < bounds.upperBound,
758-
ch.lowercased() == input[currentIndex].lowercased()
759-
else { return nil }
760-
input.formIndex(after: &currentIndex)
761-
}
762-
return currentIndex
763-
}
764-
} else {
765-
if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) {
766-
for char in s {
767-
// Note: only cr-lf is multiple scalars
768-
for scalar in char.unicodeScalars {
769-
// Only boundary check if we are the last scalar in the last character
770-
// to make sure that there isn't a combining scalar after the quoted literal
771-
let boundaryCheck = char == s.last! && scalar == char.unicodeScalars.last!
772-
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
773-
}
774-
}
775-
} else {
776-
builder.buildMatchSequence(s)
777-
}
778-
}
779-
} else {
780-
builder.buildConsume {
781-
[caseInsensitive = options.isCaseInsensitive] input, bounds in
782-
// TODO: Case folding
783-
var iterator = s.unicodeScalars.makeIterator()
784-
var currentIndex = bounds.lowerBound
785-
while let scalar = iterator.next() {
786-
guard currentIndex < bounds.upperBound else { return nil }
787-
if caseInsensitive {
788-
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
789-
return nil
790-
}
791-
} else {
792-
if scalar != input.unicodeScalars[currentIndex] {
793-
return nil
794-
}
795-
}
796-
input.unicodeScalars.formIndex(after: &currentIndex)
797-
}
798-
return currentIndex
799-
}
800-
}
817+
emitQuotedLiteral(s)
801818

802819
case let .regexLiteral(l):
803820
return try emitNode(l.ast.dslTreeNode)

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ extension Instruction {
9090
/// Match against a set of valid ascii values stored in a bitset
9191
/// Operand: Ascii bitset register containing the bitset
9292
case matchBitset
93-
/// Match a single scalar instead of a character
93+
/// Like matchBitset, but emitted in unicode scalar semantic mode: matches a single scalar and advances past it
9494
case matchBitsetScalar
9595

9696
/// TODO: builtin assertions and anchors

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -255,20 +255,8 @@ extension Processor {
255255
// ascii characters, so check if the current input element is ascii then
256256
// check if it is set in the bitset
257257
mutating func matchBitset(
258-
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
259-
scalar: Bool
258+
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset
260259
) -> Bool {
261-
if scalar {
262-
guard let curScalar = loadScalar(),
263-
bitset.matches(scalar: curScalar),
264-
let idx = nextScalarIndex(offsetBy: 1, boundaryCheck: false) else {
265-
signalFailure()
266-
return false
267-
}
268-
currentPosition = idx
269-
return true
270-
}
271-
272260
guard let cur = load(), bitset.matches(char: cur) else {
273261
signalFailure()
274262
return false
@@ -277,6 +265,20 @@ extension Processor {
277265
return true
278266
}
279267

268+
// Equivalent of matchBitset but emitted when in unicode scalar semantic mode
269+
mutating func matchBitsetScalar(
270+
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset
271+
) -> Bool {
272+
guard let curScalar = loadScalar(),
273+
bitset.matches(scalar: curScalar),
274+
let idx = nextScalarIndex(offsetBy: 1, boundaryCheck: false) else {
275+
signalFailure()
276+
return false
277+
}
278+
currentPosition = idx
279+
return true
280+
}
281+
280282
mutating func signalFailure() {
281283
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
282284
savePoints.popLast()?.destructure
@@ -428,13 +430,13 @@ extension Processor {
428430
case .matchBitset:
429431
let reg = payload.bitset
430432
let bitset = registers[reg]
431-
if matchBitset(bitset, scalar: false) {
433+
if matchBitset(bitset) {
432434
controller.step()
433435
}
434436
case .matchBitsetScalar:
435437
let reg = payload.bitset
436438
let bitset = registers[reg]
437-
if matchBitset(bitset, scalar: true) {
439+
if matchBitsetScalar(bitset) {
438440
controller.step()
439441
}
440442

Tests/RegexTests/MatchTests.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,26 @@ extension RegexTests {
636636
("\r", true),
637637
("\r\n", false))
638638

639+
do {
640+
let r = #"[a]\u0301"#
641+
var regex = try Regex(r).matchingSemantics(.unicodeScalar)
642+
let input: String = "a\u{301}"
643+
// Should match in unicode scalar semantic mode because the character class
644+
// should consume the a and then matchScalar should match the \u{301}
645+
regex._debug()
646+
XCTAssertEqual("a\u{301}", try regex.wholeMatch(in: input)?.0)
647+
// validate this is the same in unoptimized mode
648+
regex._setCompilerOptionsForTesting(.disableOptimizations)
649+
XCTAssertEqual("a\u{301}", try regex.wholeMatch(in: input)?.0)
650+
651+
// Should not match in grapheme semantic mode because a\u{301} is
652+
// a single character
653+
matchTest(r,
654+
(input, false))
655+
} catch {
656+
XCTFail("\(error)", file: #filePath, line: #line)
657+
}
658+
639659
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
640660

641661
// These are metacharacters in certain contexts, but normal characters

0 commit comments

Comments
 (0)