Skip to content

Commit e8df271

Browse files
committed
Introduce ParsingContext
Subsume the individual contextual lexing parameters into a ParsingContext type that tracks the number and names of groups, in addition to whether we're parsing within a custom character class.
1 parent 67d97e5 commit e8df271

File tree

3 files changed

+79
-42
lines changed

3 files changed

+79
-42
lines changed

Sources/_MatchingEngine/Regex/AST/Group.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ extension AST.Group.Kind {
9898
return false
9999
}
100100
}
101+
102+
/// If this is a named group, its name, `nil` otherwise.
103+
public var name: String? {
104+
switch self {
105+
case .namedCapture(let name): return name.value
106+
default: return nil
107+
}
108+
}
101109
}
102110

103111
extension AST.Group {

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -936,7 +936,7 @@ extension Source {
936936
/// | [1-9] [0-9]+
937937
///
938938
private mutating func lexEscapedReference(
939-
priorGroupCount: Int
939+
context: ParsingContext
940940
) throws -> Located<AST.Atom.Kind>? {
941941
try recordLoc { src in
942942
try src.tryEating { src in
@@ -989,7 +989,7 @@ extension Source {
989989
let num = numAndLoc.value
990990
let loc = numAndLoc.location
991991
if num < 10 || firstChar == "8" || firstChar == "9" ||
992-
num <= priorGroupCount {
992+
context.isPriorGroupRef(.absolute(num)) {
993993
return .backreference(.init(.absolute(num), innerLoc: loc))
994994
}
995995
return nil
@@ -1066,9 +1066,11 @@ extension Source {
10661066
/// | EscapedReference
10671067
///
10681068
mutating func expectEscaped(
1069-
isInCustomCharacterClass ccc: Bool, priorGroupCount: Int
1069+
context: ParsingContext
10701070
) throws -> Located<AST.Atom.Kind> {
10711071
try recordLoc { src in
1072+
let ccc = context.isInCustomCharacterClass
1073+
10721074
// Keyboard control/meta
10731075
if src.tryEat("c") || src.tryEat(sequence: "C-") {
10741076
return .keyboardControl(try src.expectASCII().value)
@@ -1092,9 +1094,7 @@ extension Source {
10921094

10931095
// References using escape syntax, e.g \1, \g{1}, \k<...>, ...
10941096
// These are not valid inside custom character classes.
1095-
if !ccc, let ref = try src.lexEscapedReference(
1096-
priorGroupCount: priorGroupCount
1097-
)?.value {
1097+
if !ccc, let ref = try src.lexEscapedReference(context: context)?.value {
10981098
return ref
10991099
}
11001100

@@ -1132,9 +1132,8 @@ extension Source {
11321132
///
11331133
/// ExpGroupStart -> '(_:'
11341134
///
1135-
mutating func lexAtom(
1136-
isInCustomCharacterClass customCC: Bool, priorGroupCount: Int
1137-
) throws -> AST.Atom? {
1135+
mutating func lexAtom(context: ParsingContext) throws -> AST.Atom? {
1136+
let customCC = context.isInCustomCharacterClass
11381137
let kind: Located<AST.Atom.Kind>? = try recordLoc { src in
11391138
// Check for not-an-atom, e.g. parser recursion termination
11401139
if src.isEmpty { return nil }
@@ -1168,9 +1167,7 @@ extension Source {
11681167
case "$": return customCC ? .char("$") : .endOfLine
11691168

11701169
// Escaped
1171-
case "\\": return try src.expectEscaped(
1172-
isInCustomCharacterClass: customCC,
1173-
priorGroupCount: priorGroupCount).value
1170+
case "\\": return try src.expectEscaped(context: context).value
11741171

11751172
case "]":
11761173
assert(!customCC, "parser should have prevented this")
@@ -1186,7 +1183,7 @@ extension Source {
11861183
/// Try to lex the end of a range in a custom character class, which consists
11871184
/// of a '-' character followed by an atom.
11881185
mutating func lexCustomCharClassRangeEnd(
1189-
priorGroupCount: Int
1186+
context: ParsingContext
11901187
) throws -> (dashLoc: SourceLocation, AST.Atom)? {
11911188
// Make sure we don't have a binary operator e.g '--', and the '-' is not
11921189
// ending the custom character class (in which case it is literal).
@@ -1195,8 +1192,7 @@ extension Source {
11951192
return nil
11961193
}
11971194
let dashLoc = Location(start ..< currentPosition)
1198-
guard let end = try lexAtom(isInCustomCharacterClass: true,
1199-
priorGroupCount: priorGroupCount) else {
1195+
guard let end = try lexAtom(context: context) else {
12001196
return nil
12011197
}
12021198
return (dashLoc, end)

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 60 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -51,28 +51,52 @@ Lexical analysis provides the following:
5151

5252
*/
5353

54-
private struct Parser {
55-
var source: Source
56-
57-
/// Tracks the number of parent custom character classes to allow us to
58-
/// determine whether or not to lex with custom character class syntax.
59-
fileprivate var customCharacterClassDepth = 0
54+
struct ParsingContext {
55+
/// Whether we're currently parsing in a custom character class.
56+
var isInCustomCharacterClass = false
6057

6158
/// Tracks the number of group openings we've seen, to disambiguate the '\n'
6259
/// syntax as a backreference or an octal sequence.
6360
fileprivate var priorGroupCount = 0
6461

62+
/// A set of used group names.
63+
fileprivate var usedGroupNames = Set<String>()
64+
65+
fileprivate mutating func recordGroup(_ g: AST.Group.Kind) {
66+
// TODO: Needs to track group number resets (?|...).
67+
priorGroupCount += 1
68+
if let name = g.name {
69+
usedGroupNames.insert(name)
70+
}
71+
}
72+
73+
private init() {}
74+
static var none: ParsingContext { .init() }
75+
76+
/// Check whether a given reference refers to a prior group.
77+
func isPriorGroupRef(_ ref: AST.Atom.Reference.Kind) -> Bool {
78+
switch ref {
79+
case .absolute(let i):
80+
return i <= priorGroupCount
81+
case .relative(let i):
82+
return i < 0
83+
case .named(let str):
84+
return usedGroupNames.contains(str)
85+
}
86+
}
87+
}
88+
89+
private struct Parser {
90+
var source: Source
91+
var context: ParsingContext = .none
92+
6593
init(_ source: Source) {
6694
self.source = source
6795
}
6896
}
6997

7098
// Diagnostics
7199
extension Parser {
72-
private var isInCustomCharacterClass: Bool {
73-
customCharacterClassDepth > 0
74-
}
75-
76100
mutating func report(
77101
_ str: String, _ function: String = #function, _ line: Int = #line
78102
) throws -> Never {
@@ -172,6 +196,20 @@ extension Parser {
172196
return .concatenation(.init(result, loc(_start)))
173197
}
174198

199+
/// Perform a recursive parse for the body of a group.
200+
mutating func parseGroupBody(
201+
start: Source.Position, _ kind: AST.Located<AST.Group.Kind>
202+
) throws -> AST.Group {
203+
context.recordGroup(kind.value)
204+
205+
let child = try parse()
206+
// An implicit scoped group has already consumed its closing paren.
207+
if !kind.value.hasImplicitScope {
208+
try source.expect(")")
209+
}
210+
return .init(kind, child, loc(start))
211+
}
212+
175213
/// Parse a (potentially quantified) component
176214
///
177215
/// QuantOperand -> Group | CustomCharClass | Atom
@@ -182,24 +220,18 @@ extension Parser {
182220

183221
let _start = source.currentPosition
184222

223+
// Check if we have the start of a group '('.
185224
if let kind = try source.lexGroupStart() {
186-
priorGroupCount += 1
187-
let child = try parse()
188-
// An implicit scoped group has already consumed its closing paren.
189-
if !kind.value.hasImplicitScope {
190-
try source.expect(")")
191-
}
192-
return .group(.init(kind, child, loc(_start)))
225+
return .group(try parseGroupBody(start: _start, kind))
193226
}
227+
228+
// Check if we have the start of a custom character class '['.
194229
if let cccStart = try source.lexCustomCCStart() {
195230
return .customCharacterClass(
196231
try parseCustomCharacterClass(cccStart))
197232
}
198233

199-
if let atom = try source.lexAtom(
200-
isInCustomCharacterClass: isInCustomCharacterClass,
201-
priorGroupCount: priorGroupCount
202-
) {
234+
if let atom = try source.lexAtom(context: context) {
203235
// TODO: track source locations
204236
return .atom(atom)
205237
}
@@ -224,6 +256,10 @@ extension Parser {
224256
mutating func parseCustomCharacterClass(
225257
_ start: Source.Located<CustomCC.Start>
226258
) throws -> CustomCC {
259+
let alreadyInCCC = context.isInCustomCharacterClass
260+
context.isInCustomCharacterClass = true
261+
defer { context.isInCustomCharacterClass = alreadyInCCC }
262+
227263
typealias Member = CustomCC.Member
228264
try source.expectNonEmpty()
229265

@@ -279,14 +315,11 @@ extension Parser {
279315
continue
280316
}
281317

282-
guard let atom = try source.lexAtom(
283-
isInCustomCharacterClass: true, priorGroupCount: priorGroupCount)
284-
else { break }
318+
guard let atom = try source.lexAtom(context: context) else { break }
285319

286320
// Range between atoms.
287-
if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(
288-
priorGroupCount: priorGroupCount
289-
) {
321+
if let (dashLoc, rhs) =
322+
try source.lexCustomCharClassRangeEnd(context: context) {
290323
guard atom.literalCharacterValue != nil &&
291324
rhs.literalCharacterValue != nil else {
292325
throw ParseError.invalidCharacterClassRangeOperand

0 commit comments

Comments
 (0)