Skip to content

Commit 5abd9b5

Browse files
committed
Parse conditional patterns
This parses a superset of the conditional syntax accepted by PCRE, Oniguruma & .NET (ICU and Java do not support this feature). This includes the ability for the condition to execute an arbitrary regex match as a group. Note there's some ambiguity with expressions of the form `(?(xxx))`: PCRE always treats this as a named group reference, whereas .NET only treats it as such if a group is defined with that name anywhere in the regex (otherwise it interprets it as an arbitrary regex condition). For now, check to see if a prior group has been defined with that name. This will need to be updated in the future to also check for any subsequent groups with that name if we want to fully match the .NET behavior here. This commit moves AST.Atom.Reference -> AST.Reference, as it's now also being used by AST.Conditional.
1 parent e8df271 commit 5abd9b5

16 files changed

+585
-65
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ public indirect enum AST:
2323
/// (...)
2424
case group(Group)
2525

26+
/// (?(cond) true-branch | false-branch)
27+
case conditional(Conditional)
28+
2629
case quantification(Quantification)
2730

2831
/// \Q...\E
@@ -55,6 +58,7 @@ extension AST {
5558
case let .alternation(v): return v
5659
case let .concatenation(v): return v
5760
case let .group(v): return v
61+
case let .conditional(v): return v
5862
case let .quantification(v): return v
5963
case let .quote(v): return v
6064
case let .trivia(v): return v
@@ -163,6 +167,42 @@ extension AST {
163167
self.location = location
164168
}
165169
}
170+
171+
public struct Reference: Hashable {
172+
@frozen
173+
public enum Kind: Hashable {
174+
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
175+
// Oniguruma: \k<n>, \k'n'
176+
case absolute(Int)
177+
178+
// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
179+
// (?(+n)... (?(-n)...
180+
// Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n'
181+
case relative(Int)
182+
183+
// \k<name> \k'name' \g{name} \k{name} (?P=name)
184+
// \g<name> \g'name' (?&name) (?P>name)
185+
// (?(<name>)... (?('name')... (?(name)...
186+
case named(String)
187+
188+
/// (?R), (?(R)..., which are equivalent to (?0), (?(0)...
189+
static var recurseWholePattern: Kind { .absolute(0) }
190+
}
191+
public var kind: Kind
192+
193+
/// The location of the inner numeric or textual reference, e.g. the location
194+
/// of '-2' in '\g{-2}'.
195+
public var innerLoc: SourceLocation
196+
197+
public init(_ kind: Kind, innerLoc: SourceLocation) {
198+
self.kind = kind
199+
self.innerLoc = innerLoc
200+
}
201+
202+
/// Whether this is a reference that recurses the whole pattern, rather than
203+
/// a group.
204+
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
205+
}
166206
}
167207

168208
// FIXME: Get this out of here

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 4 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ extension AST {
6666
// References
6767
case backreference(Reference)
6868
case subpattern(Reference)
69-
case condition(Reference)
7069
}
7170
}
7271
}
@@ -382,44 +381,6 @@ extension AST.Atom.CharacterProperty {
382381
}
383382
}
384383

385-
extension AST.Atom {
386-
public struct Reference: Hashable {
387-
@frozen
388-
public enum Kind: Hashable {
389-
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
390-
// Oniguruma: \k<n>, \k'n'
391-
case absolute(Int)
392-
393-
// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
394-
// (?(+n)... (?(-n)...
395-
// Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n'
396-
case relative(Int)
397-
398-
// \k<name> \k'name' \g{name} \k{name} (?P=name)
399-
// \g<name> \g'name' (?&name) (?P>name)
400-
// (?(<name>)... (?('name')... (?(name)...
401-
case named(String)
402-
403-
/// (?R), (?(R)..., which are equivalent to (?0), (?(0)...
404-
static var recurseWholePattern: Kind { .absolute(0) }
405-
}
406-
public var kind: Kind
407-
408-
/// The location of the inner numeric or textual reference, e.g the location
409-
/// of '-2' in '\g{-2}'.
410-
public var innerLoc: SourceLocation
411-
412-
public init(_ kind: Kind, innerLoc: SourceLocation) {
413-
self.kind = kind
414-
self.innerLoc = innerLoc
415-
}
416-
417-
/// Whether this is a reference that recurses the whole pattern, rather than
418-
/// a group.
419-
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
420-
}
421-
}
422-
423384
extension AST.Atom {
424385
/// Anchors and other built-in zero-width assertions
425386
@frozen
@@ -497,7 +458,7 @@ extension AST.Atom {
497458
fallthrough
498459

499460
case .property, .escaped, .any, .startOfLine, .endOfLine,
500-
.backreference, .subpattern, .condition, .namedCharacter:
461+
.backreference, .subpattern, .namedCharacter:
501462
return nil
502463
}
503464
}
@@ -522,7 +483,7 @@ extension AST.Atom {
522483
return "\\M-\\C-\(x)"
523484

524485
case .property, .escaped, .any, .startOfLine, .endOfLine,
525-
.backreference, .subpattern, .condition, .namedCharacter:
486+
.backreference, .subpattern, .namedCharacter:
526487
return nil
527488
}
528489
}
@@ -534,8 +495,8 @@ extension AST {
534495
case .atom(let a): return a.literalStringValue
535496

536497
case .alternation, .concatenation, .group,
537-
.quantification, .quote, .trivia,
538-
.customCharacterClass, .empty,
498+
.conditional, .quantification, .quote,
499+
.trivia, .customCharacterClass, .empty,
539500
.groupTransform:
540501
return nil
541502
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
extension AST {
13+
public struct Conditional: Hashable, _ASTNode {
14+
public var location: SourceLocation
15+
public var condition: Condition
16+
17+
public var trueBranch: AST
18+
public var pipe: SourceLocation?
19+
public var falseBranch: AST
20+
21+
public init(
22+
_ condition: Condition, trueBranch: AST, pipe: SourceLocation?,
23+
falseBranch: AST, _ location: SourceLocation
24+
) {
25+
self.location = location
26+
self.condition = condition
27+
self.trueBranch = trueBranch
28+
self.pipe = pipe
29+
self.falseBranch = falseBranch
30+
}
31+
}
32+
}
33+
34+
extension AST.Conditional {
35+
public struct Condition: Hashable {
36+
public enum Kind: Hashable {
37+
/// Check to see if a certain group was matched.
38+
case groupMatched(AST.Reference)
39+
40+
// Check for recursion.
41+
case recursionCheck
42+
case groupRecursionCheck(AST.Reference)
43+
44+
/// Define a new group that can be referenced elsewhere.
45+
case defineGroup
46+
47+
/// A PCRE version check.
48+
case pcreVersionCheck(PCREVersionCheck)
49+
50+
/// A group condition, which checks to see if an arbitrary bit of regex
51+
/// matches. Note that the semantics of this differ by engine: .NET only
52+
/// treats it as a lookahead, whereas Oniguruma can evaluate separately
53+
/// from the body of the conditional.
54+
case group(AST.Group)
55+
}
56+
57+
public var kind: Kind
58+
public var location: SourceLocation
59+
60+
public init(_ kind: Kind, _ location: SourceLocation) {
61+
self.kind = kind
62+
self.location = location
63+
}
64+
}
65+
}
66+
67+
extension AST.Conditional.Condition {
68+
public struct PCREVersionNumber: Hashable {
69+
public var major: Int
70+
public var minor: Int
71+
public var location: SourceLocation
72+
73+
public init(major: Int, minor: Int, _ location: SourceLocation) {
74+
self.major = major
75+
self.minor = minor
76+
self.location = location
77+
}
78+
}
79+
public struct PCREVersionCheck: Hashable {
80+
public enum Kind: Hashable {
81+
case equal, greaterThanOrEqual
82+
}
83+
public var kind: AST.Located<Kind>
84+
public var num: PCREVersionNumber
85+
86+
public init(_ kind: AST.Located<Kind>, _ num: PCREVersionNumber) {
87+
self.kind = kind
88+
self.num = num
89+
}
90+
}
91+
}

Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,21 @@ extension AST {
4747
default:
4848
return innerCaptures
4949
}
50+
case .conditional(let c):
51+
// A conditional's capture structure is effectively that of an alternation
52+
// between the true and false branches. However, the condition may also
53+
// have captures in the case of a group condition.
54+
var captures = CaptureStructure.empty
55+
switch c.condition.kind {
56+
case .group(let g):
57+
captures = captures + AST.group(g).captureStructure
58+
default:
59+
break
60+
}
61+
let branchCaptures = c.trueBranch.captureStructure +
62+
c.falseBranch.captureStructure
63+
return captures + branchCaptures.map(CaptureStructure.optional)
64+
5065
case .quantification(let quantification):
5166
return quantification.child.captureStructure.map(
5267
quantification.amount.value == .zeroOrOne

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ enum ParseError: Error, Hashable {
2828
// Something happened, fall-back for now
2929
case misc(String)
3030

31+
case tooManyBranchesInConditional(Int)
32+
3133
case expectedASCII(Character)
3234

3335
case expectedNonEmptyContents
@@ -77,6 +79,8 @@ extension ParseError: CustomStringConvertible {
7779
return "expected escape sequence"
7880
case .cannotReferToWholePattern:
7981
return "cannot refer to whole pattern here"
82+
case let .tooManyBranchesInConditional(i):
83+
return "expected 2 branches in conditional, have \(i)"
8084
case let .unknownGroupKind(str):
8185
return "unknown group kind '(\(str)'"
8286
case let .invalidMatchingOption(c):

0 commit comments

Comments
 (0)