Skip to content

Commit 06c77c7

Browse files
committed
Add scalar mode support for matching bitsets + fix bug
1 parent 809b085 commit 06c77c7

File tree

8 files changed

+79
-22
lines changed

8 files changed

+79
-22
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -665,11 +665,12 @@ fileprivate extension Compiler.ByteCodeGen {
665665
_ ccc: DSLTree.CustomCharacterClass
666666
) throws {
667667
if let asciiBitset = ccc.asAsciiBitset(options),
668-
options.semanticLevel == .graphemeCluster,
669668
optimizationsEnabled {
670-
// future work: add a bit to .matchBitset to consume either a character
671-
// or a scalar so we can have this optimization in scalar mode
672-
builder.buildMatchAsciiBitset(asciiBitset)
669+
if options.semanticLevel == .unicodeScalar {
670+
builder.buildScalarMatchAsciiBitset(asciiBitset)
671+
} else {
672+
builder.buildMatchAsciiBitset(asciiBitset)
673+
}
673674
} else {
674675
let consumer = try ccc.generateConsumer(options)
675676
builder.buildConsume(by: consumer)

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,16 @@
1111

1212
@_implementationOnly import _RegexParser
1313

14+
extension Character {
15+
var singleScalarAsciiValue: UInt8? {
16+
if let val = asciiValue, self != "\r\n" {
17+
return val
18+
}
19+
return nil
20+
}
21+
}
22+
23+
1424
extension DSLTree.Node {
1525
/// Attempt to generate a consumer from this AST node
1626
///
@@ -60,8 +70,8 @@ extension DSLTree._AST.Atom {
6070
extension DSLTree.Atom {
6171
var singleScalarASCIIValue: UInt8? {
6272
switch self {
63-
case let .char(c) where c != "\r\n":
64-
return c.asciiValue
73+
case let .char(c):
74+
return c.singleScalarAsciiValue
6575
case let .scalar(s) where s.isASCII:
6676
return UInt8(ascii: s)
6777
case let .unconverted(atom):
@@ -214,8 +224,8 @@ extension AST.Atom {
214224

215225
var singleScalarASCIIValue: UInt8? {
216226
switch kind {
217-
case let .char(c) where c != "\r\n":
218-
return c.asciiValue
227+
case let .char(c):
228+
return c.singleScalarAsciiValue
219229
case let .scalar(s) where s.value.isASCII:
220230
return UInt8(ascii: s.value)
221231
default:

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,8 @@ extension Instruction {
9090
/// Match against a set of valid ascii values stored in a bitset
9191
/// Operand: Ascii bitset register containing the bitset
9292
case matchBitset
93+
/// Match against an ASCII bitset like `matchBitset`, but consume a single Unicode scalar instead of a character
94+
case matchBitsetScalar
9395

9496
/// TODO: builtin assertions and anchors
9597
case builtinAssertion

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,13 @@ extension MEProgram.Builder {
163163
.matchBitset, .init(bitset: makeAsciiBitset(b))))
164164
}
165165

166+
mutating func buildScalarMatchAsciiBitset(
167+
_ b: DSLTree.CustomCharacterClass.AsciiBitset
168+
) {
169+
instructions.append(.init(
170+
.matchBitsetScalar, .init(bitset: makeAsciiBitset(b))))
171+
}
172+
166173
mutating func buildConsume(
167174
by p: @escaping MEProgram.ConsumeFunction
168175
) {

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -255,8 +255,20 @@ extension Processor {
255255
// ascii characters, so check if the current input element is ascii then
256256
// check if it is set in the bitset
257257
mutating func matchBitset(
258-
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset
258+
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
259+
scalar: Bool
259260
) -> Bool {
261+
if scalar {
262+
guard let curScalar = loadScalar(),
263+
bitset.matches(scalar: curScalar),
264+
let idx = nextScalarIndex(offsetBy: 1, boundaryCheck: false) else {
265+
signalFailure()
266+
return false
267+
}
268+
currentPosition = idx
269+
return true
270+
}
271+
260272
guard let cur = load(), bitset.matches(char: cur) else {
261273
signalFailure()
262274
return false
@@ -416,7 +428,13 @@ extension Processor {
416428
case .matchBitset:
417429
let reg = payload.bitset
418430
let bitset = registers[reg]
419-
if matchBitset(bitset) {
431+
if matchBitset(bitset, scalar: false) {
432+
controller.step()
433+
}
434+
case .matchBitsetScalar:
435+
let reg = payload.bitset
436+
let bitset = registers[reg]
437+
if matchBitset(bitset, scalar: true) {
420438
controller.step()
421439
}
422440

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 28 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -215,23 +215,41 @@ extension DSLTree {
215215
}
216216
}
217217

218+
private func matches(_ val: UInt8) -> Bool {
219+
if val < 64 {
220+
return (a >> val) & 1 == 1
221+
} else {
222+
return (b >> (val - 64)) & 1 == 1
223+
}
224+
}
225+
218226
internal func matches(char: Character) -> Bool {
219-
let ret: Bool
220-
if let val = char.asciiValue {
221-
if val < 64 {
222-
ret = (a >> val) & 1 == 1
223-
} else {
224-
ret = (b >> (val - 64)) & 1 == 1
225-
}
227+
let matched: Bool
228+
if let val = char.singleScalarAsciiValue {
229+
matched = matches(val)
226230
} else {
227-
ret = false
231+
matched = false
228232
}
229233

230234
if isInverted {
231-
return !ret
235+
return !matched
236+
}
237+
return matched
238+
}
239+
240+
internal func matches(scalar: Unicode.Scalar) -> Bool {
241+
let matched: Bool
242+
if scalar.isASCII {
243+
let val = UInt8(ascii: scalar)
244+
matched = matches(val)
245+
} else {
246+
matched = false
232247
}
233248

234-
return ret
249+
if isInverted {
250+
return !matched
251+
}
252+
return matched
235253
}
236254

237255
/// Joins another bitset from a Member of the same CustomCharacterClass

Tests/RegexTests/CompileTests.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -206,6 +206,6 @@ extension RegexTests {
206206
expectProgram(for: "[abc]", doesNotContain: [.consumeBy])
207207

208208
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset])
209-
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy])
209+
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar])
210210
}
211211
}

Tests/RegexTests/MatchTests.swift

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -633,7 +633,8 @@ extension RegexTests {
633633
("\r", true))
634634
matchTest("[\n\r]",
635635
("\n", true),
636-
("\r", true))
636+
("\r", true),
637+
("\r\n", false))
637638

638639
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
639640

0 commit comments

Comments
 (0)