Skip to content

Commit b454390

Browse files
committed
Fix CharacterClass.newlineSequence
Map `CharacterClass.newlineSequence` to the `.newlineSequence` escape instead of `.newline`, so that the correct consumer is created for it. rdar://96330096
1 parent 0ab3079 commit b454390

File tree

3 files changed

+154
-2
lines changed

3 files changed

+154
-2
lines changed

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -837,8 +837,7 @@ extension DSLTree {
837837
.init(ast: .init(.escaped(.horizontalWhitespace), .fake))
838838
}
839839
public static var _newlineSequence: Self {
840-
// FIXME: newline sequence is not same as \n
841-
.init(ast: .init(.escaped(.newline), .fake))
840+
.init(ast: .init(.escaped(.newlineSequence), .fake))
842841
}
843842
public static var _verticalWhitespace: Self {
844843
.init(ast: .init(.escaped(.verticalTab), .fake))

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,116 @@ class RegexDSLTests: XCTestCase {
110110
CharacterClass.whitespace.inverted
111111
}
112112
}
113+
114+
let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}"
115+
let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"
116+
117+
// `.newlineSequence` and `.verticalWhitespace` match the same set of
118+
// newlines in grapheme semantic mode, and in scalar mode when applied with
119+
// OneOrMore.
120+
for cc in [CharacterClass.newlineSequence, .verticalWhitespace] {
121+
for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] {
122+
try _testDSLCaptures(
123+
("\n", ("\n", "\n")),
124+
("\r", ("\r", "\r")),
125+
("\r\n", ("\r\n", "\r\n")),
126+
(allNewlines, (allNewlines[...], allNewlines[...])),
127+
("abc\ndef", ("abc\ndef", "\n")),
128+
("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")),
129+
("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])),
130+
("abc", nil),
131+
matchType: (Substring, Substring).self, ==)
132+
{
133+
Regex {
134+
ZeroOrMore {
135+
cc.inverted
136+
}
137+
Capture {
138+
OneOrMore(cc)
139+
}
140+
ZeroOrMore {
141+
cc.inverted
142+
}
143+
}.matchingSemantics(mode)
144+
}
145+
146+
// Try with ASCII-only whitespace.
147+
try _testDSLCaptures(
148+
("\n", ("\n", "\n")),
149+
("\r", ("\r", "\r")),
150+
("\r\n", ("\r\n", "\r\n")),
151+
(allNewlines, (allNewlines[...], asciiNewlines[...])),
152+
("abc\ndef", ("abc\ndef", "\n")),
153+
("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")),
154+
("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])),
155+
("abc", nil),
156+
matchType: (Substring, Substring).self, ==)
157+
{
158+
Regex {
159+
ZeroOrMore {
160+
cc.inverted
161+
}
162+
Capture {
163+
OneOrMore(cc)
164+
}
165+
ZeroOrMore {
166+
cc.inverted
167+
}
168+
}.matchingSemantics(mode).asciiOnlyWhitespace()
169+
}
170+
}
171+
}
172+
173+
// `.newlineSequence` in scalar mode may match a single `\r\n`.
174+
// `.verticalWhitespace` may not.
175+
for asciiOnly in [true, false] {
176+
try _testDSLCaptures(
177+
("\r", "\r"),
178+
("\r\n", "\r\n"),
179+
matchType: Substring.self, ==)
180+
{
181+
Regex {
182+
CharacterClass.newlineSequence
183+
}.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly)
184+
}
185+
try _testDSLCaptures(
186+
("\r", nil),
187+
("\r\n", nil),
188+
matchType: Substring.self, ==)
189+
{
190+
Regex {
191+
CharacterClass.newlineSequence.inverted
192+
}.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly)
193+
}
194+
try _testDSLCaptures(
195+
("\r", "\r"),
196+
("\r\n", nil),
197+
matchType: Substring.self, ==)
198+
{
199+
Regex {
200+
CharacterClass.verticalWhitespace
201+
}.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly)
202+
}
203+
try _testDSLCaptures(
204+
("\r", nil),
205+
("\r\n", nil),
206+
matchType: Substring.self, ==)
207+
{
208+
Regex {
209+
CharacterClass.verticalWhitespace.inverted
210+
}.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly)
211+
}
212+
try _testDSLCaptures(
213+
("\r", nil),
214+
("\r\n", nil),
215+
matchType: Substring.self, ==)
216+
{
217+
Regex {
218+
CharacterClass.verticalWhitespace.inverted
219+
"\n"
220+
}.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly)
221+
}
222+
}
113223
}
114224

115225
func testCharacterClassOperations() throws {

Tests/RegexTests/MatchTests.swift

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,49 @@ extension RegexTests {
634634
("\n", true),
635635
("\r", true))
636636

637+
let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}"
638+
let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"
639+
640+
for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] {
641+
firstMatchTest(
642+
#"\R+"#,
643+
input: "abc\(allNewlines)def", match: allNewlines,
644+
semanticLevel: level
645+
)
646+
firstMatchTest(
647+
#"\v+"#,
648+
input: "abc\(allNewlines)def", match: allNewlines,
649+
semanticLevel: level
650+
)
651+
}
652+
653+
// In scalar mode, \R can match \r\n, \v cannot.
654+
firstMatchTest(
655+
#"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar)
656+
firstMatchTest(
657+
#"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar)
658+
firstMatchTest(
659+
#"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar)
660+
firstMatchTest(
661+
#"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar)
662+
663+
// ASCII-only spaces.
664+
firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines)
665+
firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines)
666+
firstMatchTest(
667+
#"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar)
668+
firstMatchTest(
669+
#"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar)
670+
671+
matchTest(
672+
#"[a]\u0301"#,
673+
("a\u{301}", false),
674+
semanticLevel: .graphemeCluster)
675+
matchTest(
676+
#"[a]\u0301"#,
677+
("a\u{301}", true),
678+
semanticLevel: .unicodeScalar)
679+
637680
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
638681

639682
// These are metacharacters in certain contexts, but normal characters

0 commit comments

Comments
 (0)