Skip to content

Commit 4d5f716

Browse files
committed
Allow unbounded quoted sequences `\Q...`
PCRE and ICU both support quoted sequences that don't have a terminating `\E`. Update the parsing to allow this. Additionally, allow empty quoted sequences outside of custom character classes, which is consistent with ICU. Finally, don't allow quoted sequences to span multiple lines in multi-line extended syntax literals.
1 parent 62195a1 commit 4d5f716

File tree

4 files changed

+60
-6
lines changed

4 files changed

+60
-6
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ enum ParseError: Error, Hashable {
4444
case invalidEscape(Character)
4545
case confusableCharacter(Character)
4646

47+
case quoteMayNotSpanMultipleLines
48+
4749
case cannotReferToWholePattern
4850

4951
case quantifierRequiresOperand(String)
@@ -139,6 +141,8 @@ extension ParseError: CustomStringConvertible {
139141
return "invalid escape sequence '\\\(c)'"
140142
case .confusableCharacter(let c):
141143
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
144+
case .quoteMayNotSpanMultipleLines:
145+
return "quoted sequence may not span multiple lines in multi-line literal"
142146
case .cannotReferToWholePattern:
143147
return "cannot refer to whole pattern here"
144148
case .quantifierRequiresOperand(let q):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ extension Source {
579579

580580
/// Try to consume quoted content
581581
///
582-
/// Quote -> '\Q' (!'\E' .)* '\E'
582+
/// Quote -> '\Q' (!'\E' .)* '\E'?
583583
///
584584
/// With `SyntaxOptions.experimentalQuotes`, also accepts
585585
///
@@ -592,9 +592,24 @@ extension Source {
592592
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
593593
let str = try recordLoc { src -> String? in
594594
if src.tryEat(sequence: #"\Q"#) {
595-
return try src.expectQuoted(endingWith: #"\E"#).value
595+
let contents = src.lexUntil { src in
596+
src.isEmpty || src.tryEat(sequence: #"\E"#)
597+
}.value
598+
599+
// In multi-line literals, the quote may not span multiple lines.
600+
if context.syntax.contains(.multilineExtendedSyntax),
601+
contents.spansMultipleLinesInRegexLiteral {
602+
throw ParseError.quoteMayNotSpanMultipleLines
603+
}
604+
605+
// The sequence must not be empty in a custom character class.
606+
if context.isInCustomCharacterClass && contents.isEmpty {
607+
throw ParseError.expectedNonEmptyContents
608+
}
609+
return contents
596610
}
597611
if context.experimentalQuotes, src.tryEat("\"") {
612+
// TODO: Can experimental quotes be empty?
598613
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
599614
}
600615
return nil

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,13 @@ public func parse<S: StringProtocol>(
592592
return ast
593593
}
594594

595+
extension String {
596+
/// Whether the given string is considered multi-line for a regex literal.
597+
var spansMultipleLinesInRegexLiteral: Bool {
598+
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
599+
}
600+
}
601+
595602
/// Retrieve the default set of syntax options that a delimiter and literal
596603
/// contents indicates.
597604
fileprivate func defaultSyntaxOptions(
@@ -601,8 +608,7 @@ fileprivate func defaultSyntaxOptions(
601608
case .forwardSlash:
602609
// For an extended syntax forward slash e.g #/.../#, extended syntax is
603610
// permitted if it spans multiple lines.
604-
if delim.poundCount > 0 &&
605-
contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
611+
if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
606612
return .multilineExtendedSyntax
607613
}
608614
return .traditional

Tests/RegexTests/ParseTests.swift

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,14 @@ extension RegexTests {
754754
// This follows the PCRE behavior.
755755
parseTest(#"\Q\\E"#, quote("\\"))
756756

757+
// ICU allows quotes to be empty outside of custom character classes.
758+
parseTest(#"\Q\E"#, quote(""))
759+
760+
// Quotes may be unterminated.
761+
parseTest(#"\Qab"#, quote("ab"))
762+
parseTest(#"\Q"#, quote(""))
763+
parseTest("\\Qab\\", quote("ab\\"))
764+
757765
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
758766
syntax: .experimental)
759767
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
@@ -2592,8 +2600,6 @@ extension RegexTests {
25922600
diagnosticTest(#"(?P"#, .expected(")"))
25932601
diagnosticTest(#"(?R"#, .expected(")"))
25942602

2595-
diagnosticTest(#"\Qab"#, .expected("\\E"))
2596-
diagnosticTest("\\Qab\\", .expected("\\E"))
25972603
diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental)
25982604
diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental)
25992605
diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental)
@@ -2672,6 +2678,9 @@ extension RegexTests {
26722678
// TODO: Custom diagnostic for missing '\Q'
26732679
diagnosticTest(#"\E"#, .invalidEscape("E"))
26742680

2681+
diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents)
2682+
diagnosticTest(#"[\Q]"#, .expected("]"))
2683+
26752684
// PCRE treats these as octal, but we require a `0` prefix.
26762685
diagnosticTest(#"[\1]"#, .invalidEscape("1"))
26772686
diagnosticTest(#"[\123]"#, .invalidEscape("1"))
@@ -2767,6 +2776,26 @@ extension RegexTests {
27672776
""", .cannotRemoveExtendedSyntaxInMultilineMode
27682777
)
27692778

2779+
diagnosticWithDelimitersTest(#"""
2780+
#/
2781+
\Q
2782+
\E
2783+
/#
2784+
"""#, .quoteMayNotSpanMultipleLines)
2785+
2786+
diagnosticWithDelimitersTest(#"""
2787+
#/
2788+
\Qabc
2789+
\E
2790+
/#
2791+
"""#, .quoteMayNotSpanMultipleLines)
2792+
2793+
diagnosticWithDelimitersTest(#"""
2794+
#/
2795+
\Q
2796+
/#
2797+
"""#, .quoteMayNotSpanMultipleLines)
2798+
27702799
// MARK: Group specifiers
27712800

27722801
diagnosticTest(#"(*"#, .unknownGroupKind("*"))

0 commit comments

Comments
 (0)