Skip to content

Commit 451a9ac

Browse files
committed
Treat range quantifiers as literal if invalid
This matches the PCRE and Oniguruma behavior. Also make sure the matching engine can handle a 0 range.
1 parent a9a71c4 commit 451a9ac

File tree

4 files changed

+96
-49
lines changed

4 files changed

+96
-49
lines changed

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 60 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,22 @@ extension Source {
113113
try tryEatNonEmpty(sequence: String(c))
114114
}
115115

116+
/// Attempt to make a series of lexing steps in `body`, returning `nil` if
117+
/// unsuccesful, which will revert the source back to its previous state. If
118+
/// an error is thrown, the source will not be reverted.
119+
mutating func tryEating<T>(
120+
_ body: (inout Source) throws -> T?
121+
) rethrows -> T? {
122+
// We don't revert the source if an error is thrown, as it's useful to
123+
// maintain the source location in that case.
124+
let current = self
125+
guard let result = try body(&self) else {
126+
self = current
127+
return nil
128+
}
129+
return result
130+
}
131+
116132
/// Throws an expected ASCII character error if not matched
117133
mutating func expectASCII() throws -> Located<Character> {
118134
try recordLoc { src in
@@ -296,16 +312,11 @@ extension Source {
296312
if src.tryEat("+") { return .oneOrMore }
297313
if src.tryEat("?") { return .zeroOrOne }
298314

299-
// FIXME: Actually, PCRE treats empty as literal `{}`...
300-
// But Java 8 errors out?
301-
if src.tryEat("{") {
302-
// FIXME: Erm, PCRE parses as literal if no lowerbound...
303-
let amt = try src.expectRange()
304-
try src.expect("}")
305-
return amt.value // FIXME: track actual range...
315+
return try src.tryEating { src in
316+
guard src.tryEat("{"), let range = try src.lexRange(), src.tryEat("}")
317+
else { return nil }
318+
return range.value
306319
}
307-
308-
return nil
309320
}
310321
guard let amt = amt else { return nil }
311322

@@ -318,56 +329,56 @@ extension Source {
318329
return (amt, kind)
319330
}
320331

321-
/// Consume a range
332+
/// Try to consume a range, returning `nil` if unsuccessful.
322333
///
323334
/// Range -> ',' <Int> | <Int> ',' <Int>? | <Int>
324335
/// | ExpRange
325336
/// ExpRange -> '..<' <Int> | '...' <Int>
326337
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
327-
mutating func expectRange() throws -> Located<Quant.Amount> {
338+
mutating func lexRange() throws -> Located<Quant.Amount>? {
328339
try recordLoc { src in
329-
// TODO: lex positive numbers, more specifically...
330-
331-
let lowerOpt = try src.lexNumber()
332-
333-
// ',' or '...' or '..<' or nothing
334-
let closedRange: Bool?
335-
if src.tryEat(",") {
336-
closedRange = true
337-
} else if src.experimentalRanges && src.tryEat(".") {
338-
try src.expect(".")
339-
if src.tryEat(".") {
340+
try src.tryEating { src in
341+
let lowerOpt = try src.lexNumber()
342+
343+
// ',' or '...' or '..<' or nothing
344+
// TODO: We ought to try and consume whitespace here and emit a
345+
// diagnostic for the user warning them that it would cause the range to
346+
// be treated as literal.
347+
let closedRange: Bool?
348+
if src.tryEat(",") {
340349
closedRange = true
350+
} else if src.experimentalRanges && src.tryEat(".") {
351+
try src.expect(".")
352+
if src.tryEat(".") {
353+
closedRange = true
354+
} else {
355+
try src.expect("<")
356+
closedRange = false
357+
}
341358
} else {
342-
try src.expect("<")
343-
closedRange = false
359+
closedRange = nil
344360
}
345-
} else {
346-
closedRange = nil
347-
}
348-
// FIXME: wait, why `try!` ?
349-
let upperOpt: Located<Int>?
350-
if let u = try! src.lexNumber() {
351-
upperOpt = (closedRange == true) ? u : Located(u.value-1, u.location)
352-
} else {
353-
upperOpt = nil
354-
}
355361

356-
switch (lowerOpt, closedRange, upperOpt) {
357-
case let (l?, nil, nil):
358-
return .exactly(l)
359-
case let (l?, true, nil):
360-
return .nOrMore(l)
361-
case let (nil, _, u?):
362-
return .upToN(u)
363-
case let (l?, _, u?):
364-
// FIXME: source location tracking
365-
return .range(l, u)
366-
367-
case let (nil, nil, u) where u != nil:
368-
fatalError("Not possible")
369-
default:
370-
throw ParseError.misc("Invalid range")
362+
let upperOpt = try src.lexNumber()?.map { upper in
363+
// If we have an open range, the upper bound should be adjusted down.
364+
closedRange == true ? upper : upper - 1
365+
}
366+
367+
switch (lowerOpt, closedRange, upperOpt) {
368+
case let (l?, nil, nil):
369+
return .exactly(l)
370+
case let (l?, true, nil):
371+
return .nOrMore(l)
372+
case let (nil, _?, u?):
373+
return .upToN(u)
374+
case let (l?, _?, u?):
375+
return .range(l, u)
376+
377+
case (nil, nil, _?):
378+
fatalError("Didn't lex lower bound, but lexed upper bound?")
379+
default:
380+
return nil
381+
}
371382
}
372383
}
373384
}

Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ extension Source {
7676
// externally?
7777
self.init(v, .fake)
7878
}
79+
80+
public func map<U>(_ fn: (T) throws -> U) rethrows -> Located<U> {
81+
Located<U>(try fn(value), location)
82+
}
7983
}
8084
}
8185
extension AST {

Tests/RegexTests/MatchTests.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ extension RegexTests {
181181
#"a{1,2}?"#, input: "123aaaxyz", match: "a")
182182
matchTest(
183183
#"a{1,2}?x"#, input: "123aaaxyz", match: "aax")
184+
matchTest(
185+
#"xa{0}y"#, input: "123aaaxyz", match: "xy")
186+
matchTest(
187+
#"xa{0,0}y"#, input: "123aaaxyz", match: "xy")
184188

185189
matchTest("a.*", input: "dcba", match: "a")
186190

Tests/RegexTests/ParseTests.swift

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,34 @@ extension RegexTests {
448448
parseTest(
449449
#"a{1,2}?"#,
450450
quantRange(.reluctant, 1...2, "a"))
451+
parseTest(
452+
#"a{0}"#,
453+
exactly(.eager, 0, "a"))
454+
parseTest(
455+
#"a{0,0}"#,
456+
quantRange(.eager, 0...0, "a"))
457+
458+
// Make sure ranges get treated as literal if invalid.
459+
parseTest("{", "{")
460+
parseTest("{,", concat("{", ","))
461+
parseTest("{}", concat("{", "}"))
462+
parseTest("{,}", concat("{", ",", "}"))
463+
parseTest("{,6", concat("{", ",", "6"))
464+
parseTest("{6", concat("{", "6"))
465+
parseTest("{6,", concat("{", "6", ","))
466+
parseTest("{+", oneOrMore(.eager, "{"))
467+
parseTest("{6,+", concat("{", "6", oneOrMore(.eager, ",")))
468+
parseTest("x{", concat("x", "{"))
469+
parseTest("x{}", concat("x", "{", "}"))
470+
parseTest("x{,}", concat("x", "{", ",", "}"))
471+
parseTest("x{,6", concat("x", "{", ",", "6"))
472+
parseTest("x{6", concat("x", "{", "6"))
473+
parseTest("x{6,", concat("x", "{", "6", ","))
474+
parseTest("x{+", concat("x", oneOrMore(.eager, "{")))
475+
parseTest("x{6,+", concat("x", "{", "6", oneOrMore(.eager, ",")))
476+
477+
// TODO: We should emit a diagnostic for this.
478+
parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}"))
451479

452480
// MARK: Groups
453481

0 commit comments

Comments
 (0)