Skip to content

Commit 2098c07

Browse files
authored
Merge pull request #128 from milseman/split_pr
2 parents ba702f0 + e4de56d commit 2098c07

File tree

7 files changed

+205
-47
lines changed

7 files changed

+205
-47
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,12 +207,19 @@ extension AST {
207207
}
208208
public var kind: Kind
209209

210+
/// An additional specifier supported by Oniguruma that specifies what
211+
/// recursion level the group being referenced belongs to.
212+
public var recursionLevel: Located<Int>?
213+
210214
/// The location of the inner numeric or textual reference, e.g. the location
211-
/// of '-2' in '\g{-2}'.
215+
/// of '-2' in '\g{-2}'. Note this includes the recursion level for e.g.
216+
/// '\k<a+2>'.
212217
public var innerLoc: SourceLocation
213218

214-
public init(_ kind: Kind, innerLoc: SourceLocation) {
219+
public init(_ kind: Kind, recursionLevel: Located<Int>? = nil,
220+
innerLoc: SourceLocation) {
215221
self.kind = kind
222+
self.recursionLevel = recursionLevel
216223
self.innerLoc = innerLoc
217224
}
218225

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,32 @@ extension AST {
7676
}
7777
}
7878

79+
extension AST.Atom {
80+
private var _associatedValue: Any? {
81+
switch kind {
82+
case .char(let v): return v
83+
case .scalar(let v): return v
84+
case .property(let v): return v
85+
case .escaped(let v): return v
86+
case .keyboardControl(let v): return v
87+
case .keyboardMeta(let v): return v
88+
case .keyboardMetaControl(let v): return v
89+
case .namedCharacter(let v): return v
90+
case .backreference(let v): return v
91+
case .subpattern(let v): return v
92+
case .callout(let v): return v
93+
case .backtrackingDirective(let v): return v
94+
case .any: return nil
95+
case .startOfLine: return nil
96+
case .endOfLine: return nil
97+
}
98+
}
99+
100+
func `as`<T>(_ t: T.Type = T.self) -> T? {
101+
_associatedValue as? T
102+
}
103+
}
104+
79105
extension AST.Atom {
80106

81107
// TODO: We might scrap this and break out a few categories so

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -865,8 +865,6 @@ extension Source {
865865
return .recursionCheck
866866
}
867867

868-
// TODO: Oniguruma can also parse an additional recursion level for
869-
// group-matched checks.
870868
if let open = src.tryEat(anyOf: "<", "'") {
871869
// In PCRE, this can only be a named reference. In Oniguruma, it can
872870
// also be a numbered reference.
@@ -884,9 +882,9 @@ extension Source {
884882
}
885883

886884
// If we have a numbered reference, this is a check to see if a group
887-
// matched.
888-
if let numRef = try src.lexNumberedReference() {
889-
return .groupMatched(numRef)
885+
// matched. Oniguruma also permits a recursion level here.
886+
if let num = try src.lexNumberedReference(allowRecursionLevel: true) {
887+
return .groupMatched(num)
890888
}
891889

892890
// PCRE and .NET also allow a named reference to be parsed here. PCRE
@@ -896,9 +894,9 @@ extension Source {
896894
// FIXME: This should apply to future groups too.
897895
// TODO: We should probably advise users to use the more explicit
898896
// syntax.
899-
if let nameRef = src.lexNamedReference(endingWith: ")",
900-
eatEnding: false),
901-
context.isPriorGroupRef(nameRef.kind) {
897+
let nameRef = src.lexNamedReference(
898+
endingWith: ")", eatEnding: false, allowRecursionLevel: true)
899+
if let nameRef = nameRef, context.isPriorGroupRef(nameRef.kind) {
902900
return .groupMatched(nameRef)
903901
}
904902
return nil
@@ -1052,10 +1050,10 @@ extension Source {
10521050

10531051
/// Try to lex an absolute or relative numbered reference.
10541052
///
1055-
/// NumberRef -> ('+' | '-')? <Decimal Number>
1053+
/// NumberRef -> ('+' | '-')? <Decimal Number> RecursionLevel?
10561054
///
10571055
private mutating func lexNumberedReference(
1058-
allowWholePatternRef: Bool = false
1056+
allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false
10591057
) throws -> AST.Reference? {
10601058
let kind = try recordLoc { src -> AST.Reference.Kind? in
10611059
// Note this logic should match canLexNumberedReference.
@@ -1074,7 +1072,22 @@ extension Source {
10741072
guard allowWholePatternRef || kind.value != .recurseWholePattern else {
10751073
throw ParseError.cannotReferToWholePattern
10761074
}
1077-
return .init(kind.value, innerLoc: kind.location)
1075+
let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil
1076+
let loc = recLevel?.location.union(with: kind.location) ?? kind.location
1077+
return .init(kind.value, recursionLevel: recLevel, innerLoc: loc)
1078+
}
1079+
1080+
/// Try to consume a recursion level for a group reference.
1081+
///
1082+
/// RecursionLevel -> '+' <Int> | '-' <Int>
1083+
///
1084+
private mutating func lexRecursionLevel(
1085+
) throws -> Located<Int>? {
1086+
try recordLoc { src in
1087+
if src.tryEat("+") { return try src.expectNumber().value }
1088+
if src.tryEat("-") { return try -src.expectNumber().value }
1089+
return nil
1090+
}
10781091
}
10791092

10801093
/// Checks whether a numbered reference can be lexed.
@@ -1087,19 +1100,34 @@ extension Source {
10871100

10881101
/// Eat a named reference up to a given closing delimiter.
10891102
private mutating func expectNamedReference(
1090-
endingWith end: String, eatEnding: Bool = true
1103+
endingWith end: String, eatEnding: Bool = true,
1104+
allowRecursionLevel: Bool = false
10911105
) throws -> AST.Reference {
1092-
let str = try expectGroupName(endingWith: end, eatEnding: eatEnding)
1093-
return .init(.named(str.value), innerLoc: str.location)
1106+
// Note we don't want to eat the ending as we may also want to parse a
1107+
// recursion level.
1108+
let str = try expectGroupName(endingWith: end, eatEnding: false)
1109+
1110+
// If we're allowed to, try to parse a recursion level.
1111+
let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil
1112+
let loc = recLevel?.location.union(with: str.location) ?? str.location
1113+
1114+
if eatEnding {
1115+
try expect(sequence: end)
1116+
}
1117+
return .init(.named(str.value), recursionLevel: recLevel, innerLoc: loc)
10941118
}
10951119

10961120
/// Try to consume a named reference up to a closing delimiter, returning
10971121
/// `nil` if the characters aren't valid for a named reference.
10981122
private mutating func lexNamedReference(
1099-
endingWith end: String, eatEnding: Bool = true
1123+
endingWith end: String, eatEnding: Bool = true,
1124+
allowRecursionLevel: Bool = false
11001125
) -> AST.Reference? {
11011126
tryEating { src in
1102-
try? src.expectNamedReference(endingWith: end, eatEnding: eatEnding)
1127+
try? src.expectNamedReference(
1128+
endingWith: end, eatEnding: eatEnding,
1129+
allowRecursionLevel: allowRecursionLevel
1130+
)
11031131
}
11041132
}
11051133

@@ -1109,17 +1137,22 @@ extension Source {
11091137
///
11101138
private mutating func expectNamedOrNumberedReference(
11111139
endingWith ending: String, eatEnding: Bool = true,
1112-
allowWholePatternRef: Bool = false
1140+
allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false
11131141
) throws -> AST.Reference {
1114-
if let numbered = try lexNumberedReference(
1115-
allowWholePatternRef: allowWholePatternRef
1116-
) {
1142+
let num = try lexNumberedReference(
1143+
allowWholePatternRef: allowWholePatternRef,
1144+
allowRecursionLevel: allowRecursionLevel
1145+
)
1146+
if let num = num {
11171147
if eatEnding {
11181148
try expect(sequence: ending)
11191149
}
1120-
return numbered
1150+
return num
11211151
}
1122-
return try expectNamedReference(endingWith: ending, eatEnding: eatEnding)
1152+
return try expectNamedReference(
1153+
endingWith: ending, eatEnding: eatEnding,
1154+
allowRecursionLevel: allowRecursionLevel
1155+
)
11231156
}
11241157

11251158
private static func getClosingDelimiter(
@@ -1176,11 +1209,21 @@ extension Source {
11761209
}
11771210

11781211
if src.tryEat("k") {
1179-
// Perl/.NET-style backreferences.
1180-
if let openChar = src.tryEat(anyOf: "<", "'", "{") {
1212+
// Perl/.NET/Oniguruma-style backreferences.
1213+
if let openChar = src.tryEat(anyOf: "<", "'") {
11811214
let closing = String(Source.getClosingDelimiter(for: openChar))
1215+
1216+
// Perl only accepts named references here, but Oniguruma and .NET
1217+
// also accept numbered references. This shouldn't be an ambiguity
1218+
// as named references may not begin with a digit, '-', or '+'.
1219+
// Oniguruma also allows a recursion level to be specified.
1220+
return .backreference(try src.expectNamedOrNumberedReference(
1221+
endingWith: closing, allowRecursionLevel: true))
1222+
}
1223+
// Perl/.NET also allow named references with the '{' delimiter.
1224+
if src.tryEat("{") {
11821225
return .backreference(
1183-
try src.expectNamedReference(endingWith: closing))
1226+
try src.expectNamedReference(endingWith: "}"))
11841227
}
11851228
return nil
11861229
}
@@ -1199,10 +1242,10 @@ extension Source {
11991242
// here.
12001243
if firstChar != "0", let numAndLoc = try src.lexNumber() {
12011244
let num = numAndLoc.value
1202-
let loc = numAndLoc.location
1245+
let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
12031246
if num < 10 || firstChar == "8" || firstChar == "9" ||
1204-
context.isPriorGroupRef(.absolute(num)) {
1205-
return .backreference(.init(.absolute(num), innerLoc: loc))
1247+
context.isPriorGroupRef(ref.kind) {
1248+
return .backreference(ref)
12061249
}
12071250
return nil
12081251
}
@@ -1225,7 +1268,6 @@ extension Source {
12251268
try recordLoc { src in
12261269
try src.tryEating { src in
12271270
guard src.tryEat(sequence: "(?") else { return nil }
1228-
let _start = src.currentPosition
12291271

12301272
// Note the below should be covered by canLexGroupLikeReference.
12311273

@@ -1243,8 +1285,7 @@ extension Source {
12431285
}
12441286

12451287
// Whole-pattern recursion, which is equivalent to (?0).
1246-
if src.tryEat("R") {
1247-
let loc = Location(_start ..< src.currentPosition)
1288+
if let loc = src.tryEatWithLoc("R") {
12481289
try src.expect(")")
12491290
return .subpattern(.init(.recurseWholePattern, innerLoc: loc))
12501291
}

Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ extension Source {
3636
}
3737
public var isFake: Bool { self == Self.fake }
3838
public var isReal: Bool { !isFake }
39+
40+
/// Returns the smallest location that contains both this location and
41+
/// another.
42+
public func union(with other: Location) -> SourceLocation {
43+
.init(min(start, other.start) ..< max(end, other.end))
44+
}
3945
}
4046
}
4147
public typealias SourceLocation = Source.Location

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,11 @@ extension AST.Atom.Callout: _ASTPrintable {
149149

150150
extension AST.Reference: _ASTPrintable {
151151
public var _dumpBase: String {
152-
"\(kind)"
152+
var result = "\(kind)"
153+
if let recursionLevel = recursionLevel {
154+
result += "\(recursionLevel.value)"
155+
}
156+
return result
153157
}
154158
}
155159

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,17 +139,21 @@ func unsetMatchingOptions(
139139
unsetMatchingOptions(adding: adding)
140140
}
141141

142-
func ref(_ i: Int) -> AST.Reference {
143-
.init(.absolute(i), innerLoc: .fake)
142+
func ref(_ i: Int, recursionLevel: Int? = nil) -> AST.Reference {
143+
.init(.absolute(i), recursionLevel: recursionLevel.map { .init(faking: $0) },
144+
innerLoc: .fake)
144145
}
145-
func ref(plus n: Int) -> AST.Reference {
146-
.init(.relative(n), innerLoc: .fake)
146+
func ref(plus n: Int, recursionLevel: Int? = nil) -> AST.Reference {
147+
.init(.relative(n), recursionLevel: recursionLevel.map { .init(faking: $0) },
148+
innerLoc: .fake)
147149
}
148-
func ref(minus n: Int) -> AST.Reference {
149-
.init(.relative(-n), innerLoc: .fake)
150+
func ref(minus n: Int, recursionLevel: Int? = nil) -> AST.Reference {
151+
.init(.relative(-n), recursionLevel: recursionLevel.map { .init(faking: $0) },
152+
innerLoc: .fake)
150153
}
151-
func ref(_ s: String) -> AST.Reference {
152-
.init(.named(s), innerLoc: .fake)
154+
func ref(_ s: String, recursionLevel: Int? = nil) -> AST.Reference {
155+
.init(.named(s), recursionLevel: recursionLevel.map { .init(faking: $0) },
156+
innerLoc: .fake)
153157
}
154158
func conditional(
155159
_ cond: AST.Conditional.Condition.Kind, trueBranch: AST, falseBranch: AST
@@ -286,8 +290,10 @@ func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member {
286290
atom_m(.scalar(s))
287291
}
288292

289-
func backreference(_ r: AST.Reference.Kind) -> AST {
290-
atom(.backreference(.init(r, innerLoc: .fake)))
293+
func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST {
294+
atom(.backreference(.init(
295+
r, recursionLevel: recursionLevel.map { .init(faking: $0) }, innerLoc: .fake
296+
)))
291297
}
292298
func subpattern(_ r: AST.Reference.Kind) -> AST {
293299
atom(.subpattern(.init(r, innerLoc: .fake)))

0 commit comments

Comments
 (0)