Skip to content

Commit 1a359b4

Browse files
committed
Add case-insensitive match instructions
Add case-insensitive match instructions
1 parent 6e4c2bd commit 1a359b4

File tree

5 files changed

+168
-91
lines changed

5 files changed

+168
-91
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 24 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ fileprivate extension Compiler.ByteCodeGen {
4545
emitAny()
4646

4747
case let .char(c):
48-
try emitCharacter(c)
48+
emitCharacter(c)
4949

5050
case let .scalar(s):
51-
try emitScalar(s)
51+
emitScalar(s)
5252

5353
case let .assertion(kind):
5454
try emitAssertion(kind.ast)
@@ -74,81 +74,32 @@ fileprivate extension Compiler.ByteCodeGen {
7474
}
7575
}
7676

77-
mutating func emitScalarQuotedLiteral(_ s: String) {
78-
precondition(options.semanticLevel == .unicodeScalar)
79-
if optimizationsEnabled && !options.isCaseInsensitive {
80-
// Match all scalars exactly, never boundary check because we're in
81-
// unicode scalars mode
82-
for char in s {
83-
for scalar in char.unicodeScalars {
84-
builder.buildMatchScalar(scalar, boundaryCheck: false)
85-
}
86-
}
87-
return
88-
}
89-
90-
builder.buildConsume {
91-
[caseInsensitive = options.isCaseInsensitive] input, bounds in
92-
// TODO: Case folding
93-
var iterator = s.unicodeScalars.makeIterator()
94-
var currentIndex = bounds.lowerBound
95-
while let scalar = iterator.next() {
96-
guard currentIndex < bounds.upperBound else { return nil }
97-
if caseInsensitive {
98-
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
99-
return nil
100-
}
101-
} else {
102-
if scalar != input.unicodeScalars[currentIndex] {
103-
return nil
104-
}
105-
}
106-
input.unicodeScalars.formIndex(after: &currentIndex)
107-
}
108-
return currentIndex
109-
}
110-
}
111-
11277
mutating func emitQuotedLiteral(_ s: String) {
11378
guard options.semanticLevel == .graphemeCluster else {
114-
emitScalarQuotedLiteral(s)
115-
return
116-
}
117-
118-
if options.isCaseInsensitive {
119-
// future work: if all ascii, emit matchBitset instructions with
120-
// case insensitive bitsets
121-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
122-
builder.buildConsume { input, bounds in
123-
var iterator = s.makeIterator()
124-
var currentIndex = bounds.lowerBound
125-
while let ch = iterator.next() {
126-
guard currentIndex < bounds.upperBound,
127-
ch.lowercased() == input[currentIndex].lowercased()
128-
else { return nil }
129-
input.formIndex(after: &currentIndex)
79+
for char in s {
80+
for scalar in char.unicodeScalars {
81+
emitScalar(scalar)
13082
}
131-
return currentIndex
13283
}
13384
return
13485
}
13586

87+
// Fast path: elide boundary checks for an all-ASCII quoted literal
13688
if optimizationsEnabled && s.allSatisfy({char in char.isASCII}) {
13789
let lastIdx = s.unicodeScalars.indices.last!
13890
for idx in s.unicodeScalars.indices {
139-
if idx == lastIdx {
140-
// Only boundary check if we are the last scalar in the last character
141-
// to make sure that there isn't a combining scalar after the quoted literal
142-
builder.buildMatchScalar(s.unicodeScalars[idx], boundaryCheck: true)
91+
let boundaryCheck = idx == lastIdx
92+
let scalar = s.unicodeScalars[idx]
93+
if options.isCaseInsensitive && scalar.properties.isCased {
94+
builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
14395
} else {
144-
builder.buildMatchScalar(s.unicodeScalars[idx], boundaryCheck: false)
96+
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
14597
}
14698
}
14799
return
148100
}
149101

150-
builder.buildMatchSequence(s)
151-
return
102+
for c in s { emitCharacter(c) }
152103
}
153104

154105
mutating func emitBackreference(
@@ -286,35 +237,29 @@ fileprivate extension Compiler.ByteCodeGen {
286237
}
287238
}
288239

289-
mutating func emitScalar(_ s: UnicodeScalar) throws {
290-
if options.isCaseInsensitive {
291-
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
292-
builder.buildConsume(by: consumeScalar {
293-
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
294-
})
295-
return
240+
mutating func emitScalar(_ s: UnicodeScalar) {
241+
if options.isCaseInsensitive && s.properties.isCased {
242+
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
243+
} else {
244+
builder.buildMatchScalar(s, boundaryCheck: false)
296245
}
297-
298-
builder.buildMatchScalar(s, boundaryCheck: false)
299246
}
300247

301-
mutating func emitCharacter(_ c: Character) throws {
248+
mutating func emitCharacter(_ c: Character) {
302249
// Unicode scalar mode matches the specific scalars that comprise a character
303250
if options.semanticLevel == .unicodeScalar {
304251
for scalar in c.unicodeScalars {
305-
try emitScalar(scalar)
252+
emitScalar(scalar)
306253
}
307254
return
308255
}
309256

310257
if options.isCaseInsensitive && c.isCased {
311-
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
312-
builder.buildConsume { input, bounds in
313-
let inputChar = input[bounds.lowerBound].lowercased()
314-
let matchChar = c.lowercased()
315-
return inputChar == matchChar
316-
? input.index(after: bounds.lowerBound)
317-
: nil
258+
if optimizationsEnabled && c.isASCII {
259+
// c is ASCII, and c.isCased rules out CR-LF (the only multi-scalar ASCII character), so c is a single scalar
260+
builder.buildMatchScalarCaseInsensitive(c.unicodeScalars.last!, boundaryCheck: true)
261+
} else {
262+
builder.buildMatchCaseInsensitive(c)
318263
}
319264
return
320265
}

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,42 @@ extension Instruction {
7777
/// Operand: Element register to compare against.
7878
case match
7979

80+
/// Matches the given character case insensitively
81+
///
82+
/// matchCaseInsensitive(_: EltReg)
83+
///
84+
/// Operand: Element register to compare against.
85+
case matchCaseInsensitive
86+
8087
/// Match against a sequence of elements
8188
///
8289
/// matchSequence(_: SeqReg)
8390
///
8491
/// Operand: Sequence register to compare against.
8592
case matchSequence
8693

94+
/// Match against a scalar and perform a grapheme boundary check
95+
///
96+
/// matchScalar(_: Unicode.Scalar)
97+
/// Operand: Scalar value to match against
8798
case matchScalar
99+
/// Match against a scalar and do NOT perform a grapheme boundary check
100+
///
101+
/// matchScalarUnchecked(_: Unicode.Scalar)
102+
/// Operand: Scalar value to match against
88103
case matchScalarUnchecked
89104

105+
/// Match against a scalar case insensitively and perform a grapheme boundary check
106+
///
107+
/// matchScalarCaseInsensitive(_: Unicode.Scalar)
108+
/// Operand: Scalar value to match against
109+
case matchScalarCaseInsensitive
110+
/// Match against a scalar case insensitively and do NOT perform a grapheme boundary check
111+
///
112+
/// matchScalarCaseInsensitiveUnchecked(_: Unicode.Scalar)
113+
/// Operand: Scalar value to match against
114+
case matchScalarCaseInsensitiveUnchecked
115+
90116
/// Match against a set of valid ascii values stored in a bitset
91117
/// Operand: Ascii bitset register containing the bitset
92118
case matchBitset

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ extension MEProgram.Builder {
140140
.match, .init(element: elements.store(e))))
141141
}
142142

143+
mutating func buildMatchCaseInsensitive(_ e: Character) {
144+
instructions.append(.init(
145+
.matchCaseInsensitive, .init(element: elements.store(e))))
146+
}
147+
143148
mutating func buildMatchSequence<S: Sequence>(
144149
_ s: S
145150
) where S.Element == Character {
@@ -155,6 +160,15 @@ extension MEProgram.Builder {
155160
instructions.append(.init(.matchScalarUnchecked, .init(scalar: s)))
156161
}
157162
}
163+
164+
mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) {
165+
if boundaryCheck {
166+
instructions.append(.init(.matchScalarCaseInsensitive, .init(scalar: s)))
167+
} else {
168+
instructions.append(.init(.matchScalarCaseInsensitiveUnchecked, .init(scalar: s)))
169+
}
170+
}
171+
158172

159173
mutating func buildMatchAsciiBitset(
160174
_ b: DSLTree.CustomCharacterClass.AsciiBitset

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,15 @@ extension Processor {
216216
return true
217217
}
218218

219+
mutating func matchCaseInsensitive(_ e: Element) -> Bool {
220+
guard let cur = load(), cur.lowercased() == e.lowercased() else {
221+
signalFailure()
222+
return false
223+
}
224+
_uncheckedForcedConsumeOne()
225+
return true
226+
}
227+
219228
// Match against the current input prefix. Returns whether
220229
// it succeeded vs signaling an error.
221230
mutating func matchSeq<C: Collection>(
@@ -233,7 +242,29 @@ extension Processor {
233242

234243
mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool {
235244
guard s == loadScalar(),
236-
let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end),
245+
let idx = input.unicodeScalars.index(
246+
currentPosition,
247+
offsetBy: 1,
248+
limitedBy: end),
249+
(!boundaryCheck || input.isOnGraphemeClusterBoundary(idx))
250+
else {
251+
signalFailure()
252+
return false
253+
}
254+
currentPosition = idx
255+
return true
256+
}
257+
258+
mutating func matchScalarCaseInsensitive(
259+
_ s: Unicode.Scalar,
260+
boundaryCheck: Bool
261+
) -> Bool {
262+
guard let curScalar = loadScalar(),
263+
s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping,
264+
let idx = input.unicodeScalars.index(
265+
currentPosition,
266+
offsetBy: 1,
267+
limitedBy: end),
237268
(!boundaryCheck || input.isOnGraphemeClusterBoundary(idx))
238269
else {
239270
signalFailure()
@@ -400,6 +431,11 @@ extension Processor {
400431
if match(registers[reg]) {
401432
controller.step()
402433
}
434+
case .matchCaseInsensitive:
435+
let reg = payload.element
436+
if matchCaseInsensitive(registers[reg]) {
437+
controller.step()
438+
}
403439

404440
case .matchSequence:
405441
let reg = payload.sequence
@@ -418,6 +454,16 @@ extension Processor {
418454
if matchScalar(scalar, boundaryCheck: false) {
419455
controller.step()
420456
}
457+
case .matchScalarCaseInsensitive:
458+
let scalar = payload.scalar
459+
if matchScalarCaseInsensitive(scalar, boundaryCheck: true) {
460+
controller.step()
461+
}
462+
case .matchScalarCaseInsensitiveUnchecked:
463+
let scalar = payload.scalar
464+
if matchScalarCaseInsensitive(scalar, boundaryCheck: false) {
465+
controller.step()
466+
}
421467

422468
case .matchBitset:
423469
let reg = payload.bitset

0 commit comments

Comments
 (0)