File tree Expand file tree Collapse file tree 8 files changed +79
-22
lines changed
Sources/_StringProcessing Expand file tree Collapse file tree 8 files changed +79
-22
lines changed Original file line number Diff line number Diff line change @@ -665,11 +665,12 @@ fileprivate extension Compiler.ByteCodeGen {
665
665
_ ccc: DSLTree . CustomCharacterClass
666
666
) throws {
667
667
if let asciiBitset = ccc. asAsciiBitset ( options) ,
668
- options. semanticLevel == . graphemeCluster,
669
668
optimizationsEnabled {
670
- // future work: add a bit to .matchBitset to consume either a character
671
- // or a scalar so we can have this optimization in scalar mode
672
- builder. buildMatchAsciiBitset ( asciiBitset)
669
+ if options. semanticLevel == . unicodeScalar {
670
+ builder. buildScalarMatchAsciiBitset ( asciiBitset)
671
+ } else {
672
+ builder. buildMatchAsciiBitset ( asciiBitset)
673
+ }
673
674
} else {
674
675
let consumer = try ccc. generateConsumer ( options)
675
676
builder. buildConsume ( by: consumer)
Original file line number Diff line number Diff line change 11
11
12
12
@_implementationOnly import _RegexParser
13
13
14
+ extension Character {
15
+ var singleScalarAsciiValue : UInt8 ? {
16
+ if let val = asciiValue, self != " \r \n " {
17
+ return val
18
+ }
19
+ return nil
20
+ }
21
+ }
22
+
23
+
14
24
extension DSLTree . Node {
15
25
/// Attempt to generate a consumer from this AST node
16
26
///
@@ -60,8 +70,8 @@ extension DSLTree._AST.Atom {
60
70
extension DSLTree . Atom {
61
71
var singleScalarASCIIValue : UInt8 ? {
62
72
switch self {
63
- case let . char( c) where c != " \r \n " :
64
- return c. asciiValue
73
+ case let . char( c) :
74
+ return c. singleScalarAsciiValue
65
75
case let . scalar( s) where s. isASCII:
66
76
return UInt8 ( ascii: s)
67
77
case let . unconverted( atom) :
@@ -214,8 +224,8 @@ extension AST.Atom {
214
224
215
225
var singleScalarASCIIValue : UInt8 ? {
216
226
switch kind {
217
- case let . char( c) where c != " \r \n " :
218
- return c. asciiValue
227
+ case let . char( c) :
228
+ return c. singleScalarAsciiValue
219
229
case let . scalar( s) where s. value. isASCII:
220
230
return UInt8 ( ascii: s. value)
221
231
default :
Original file line number Diff line number Diff line change @@ -90,6 +90,8 @@ extension Instruction {
90
90
/// Match against a set of valid ascii values stored in a bitset
91
91
/// Operand: Ascii bitset register containing the bitset
92
92
case matchBitset
93
+ /// Match a single scalar instead of a character
94
+ case matchBitsetScalar
93
95
94
96
/// TODO: builtin assertions and anchors
95
97
case builtinAssertion
Original file line number Diff line number Diff line change @@ -163,6 +163,13 @@ extension MEProgram.Builder {
163
163
. matchBitset, . init( bitset: makeAsciiBitset ( b) ) ) )
164
164
}
165
165
166
+ mutating func buildScalarMatchAsciiBitset(
167
+ _ b: DSLTree . CustomCharacterClass . AsciiBitset
168
+ ) {
169
+ instructions. append ( . init(
170
+ . matchBitsetScalar, . init( bitset: makeAsciiBitset ( b) ) ) )
171
+ }
172
+
166
173
mutating func buildConsume(
167
174
by p: @escaping MEProgram . ConsumeFunction
168
175
) {
Original file line number Diff line number Diff line change @@ -255,8 +255,20 @@ extension Processor {
255
255
// ascii characters, so check if the current input element is ascii then
256
256
// check if it is set in the bitset
257
257
mutating func matchBitset(
258
- _ bitset: DSLTree . CustomCharacterClass . AsciiBitset
258
+ _ bitset: DSLTree . CustomCharacterClass . AsciiBitset ,
259
+ scalar: Bool
259
260
) -> Bool {
261
+ if scalar {
262
+ guard let curScalar = loadScalar ( ) ,
263
+ bitset. matches ( scalar: curScalar) ,
264
+ let idx = nextScalarIndex ( offsetBy: 1 , boundaryCheck: false ) else {
265
+ signalFailure ( )
266
+ return false
267
+ }
268
+ currentPosition = idx
269
+ return true
270
+ }
271
+
260
272
guard let cur = load ( ) , bitset. matches ( char: cur) else {
261
273
signalFailure ( )
262
274
return false
@@ -416,7 +428,13 @@ extension Processor {
416
428
case . matchBitset:
417
429
let reg = payload. bitset
418
430
let bitset = registers [ reg]
419
- if matchBitset ( bitset) {
431
+ if matchBitset ( bitset, scalar: false ) {
432
+ controller. step ( )
433
+ }
434
+ case . matchBitsetScalar:
435
+ let reg = payload. bitset
436
+ let bitset = registers [ reg]
437
+ if matchBitset ( bitset, scalar: true ) {
420
438
controller. step ( )
421
439
}
422
440
Original file line number Diff line number Diff line change @@ -215,23 +215,41 @@ extension DSLTree {
215
215
}
216
216
}
217
217
218
+ private func matches( _ val: UInt8 ) -> Bool {
219
+ if val < 64 {
220
+ return ( a >> val) & 1 == 1
221
+ } else {
222
+ return ( b >> ( val - 64 ) ) & 1 == 1
223
+ }
224
+ }
225
+
218
226
internal func matches( char: Character ) -> Bool {
219
- let ret : Bool
220
- if let val = char. asciiValue {
221
- if val < 64 {
222
- ret = ( a >> val) & 1 == 1
223
- } else {
224
- ret = ( b >> ( val - 64 ) ) & 1 == 1
225
- }
227
+ let matched : Bool
228
+ if let val = char. singleScalarAsciiValue {
229
+ matched = matches ( val)
226
230
} else {
227
- ret = false
231
+ matched = false
228
232
}
229
233
230
234
if isInverted {
231
- return !ret
235
+ return !matched
236
+ }
237
+ return matched
238
+ }
239
+
240
+ internal func matches( scalar: Unicode . Scalar ) -> Bool {
241
+ let matched : Bool
242
+ if scalar. isASCII {
243
+ let val = UInt8 ( ascii: scalar)
244
+ matched = matches ( val)
245
+ } else {
246
+ matched = false
232
247
}
233
248
234
- return ret
249
+ if isInverted {
250
+ return !matched
251
+ }
252
+ return matched
235
253
}
236
254
237
255
/// Joins another bitset from a Member of the same CustomCharacterClass
Original file line number Diff line number Diff line change @@ -206,6 +206,6 @@ extension RegexTests {
206
206
expectProgram ( for: " [abc] " , doesNotContain: [ . consumeBy] )
207
207
208
208
expectProgram ( for: " [abc] " , semanticLevel: . unicodeScalar, doesNotContain: [ . matchBitset] )
209
- expectProgram ( for: " [abc] " , semanticLevel: . unicodeScalar, contains: [ . consumeBy ] )
209
+ expectProgram ( for: " [abc] " , semanticLevel: . unicodeScalar, contains: [ . matchBitsetScalar ] )
210
210
}
211
211
}
Original file line number Diff line number Diff line change @@ -633,7 +633,8 @@ extension RegexTests {
633
633
( " \r " , true ) )
634
634
matchTest ( " [ \n \r ] " ,
635
635
( " \n " , true ) ,
636
- ( " \r " , true ) )
636
+ ( " \r " , true ) ,
637
+ ( " \r \n " , false ) )
637
638
638
639
firstMatchTest ( " [-] " , input: " 123-abcxyz " , match: " - " )
639
640
You can’t perform that action at this time.
0 commit comments