Skip to content

Commit 1432da0

Browse files
Improve regexp/require-unicode-regexp fixes (#352)
* Improve `require-unicode-regexp` fixes * Fixed doc comment
1 parent bdb63bc commit 1432da0

File tree

2 files changed

+159
-18
lines changed

2 files changed

+159
-18
lines changed

lib/rules/require-unicode-regexp.ts

Lines changed: 138 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
11
import type { CharRange } from "refa"
22
import { visitRegExpAST, RegExpParser } from "regexpp"
3-
import type { Pattern } from "regexpp/ast"
3+
import type {
4+
Character,
5+
CharacterClass,
6+
CharacterSet,
7+
Node,
8+
Pattern,
9+
Quantifier,
10+
} from "regexpp/ast"
411
import type { RegExpVisitor } from "regexpp/visitor"
512
import type { RegExpContext } from "../utils"
613
import { createRule, defineRegexpVisitor } from "../utils"
7-
import { hasSomeDescendant, toCache, toCharSet } from "regexp-ast-analysis"
14+
import type { ReadonlyFlags } from "regexp-ast-analysis"
15+
import {
16+
hasSomeDescendant,
17+
toCache,
18+
toCharSet,
19+
getFirstCharAfter,
20+
} from "regexp-ast-analysis"
821

922
const UTF16_MAX = 0xffff
1023

@@ -93,7 +106,10 @@ function isSyntacticallyCompatible(pattern: Pattern): false | Pattern {
93106
return uPattern
94107
}
95108

109+
const HIGH_SURROGATES: CharRange = { min: 0xd800, max: 0xdbff }
110+
const LOW_SURROGATES: CharRange = { min: 0xdc00, max: 0xdfff }
96111
const SURROGATES: CharRange = { min: 0xd800, max: 0xdfff }
112+
const ASTRAL: CharRange = { min: 0x10000, max: 0x10ffff }
97113

98114
/** Returns whether the two given ranges are equal. */
99115
function rangeEqual(a: readonly CharRange[], b: readonly CharRange[]): boolean {
@@ -110,6 +126,111 @@ function rangeEqual(a: readonly CharRange[], b: readonly CharRange[]): boolean {
110126
return true
111127
}
112128

129+
type CharLike = Character | CharacterClass | CharacterSet
130+
131+
/** Whether the given node is a character-like element. */
132+
function isChar(node: Node): node is CharLike {
133+
return (
134+
node.type === "Character" ||
135+
node.type === "CharacterClass" ||
136+
node.type === "CharacterSet"
137+
)
138+
}
139+
140+
/**
141+
* Whether the given char-like accepts the same characters with and without
142+
* the u flag.
143+
*/
144+
function isCompatibleCharLike(
145+
char: CharLike,
146+
flags: ReadonlyFlags,
147+
uFlags: ReadonlyFlags,
148+
): boolean {
149+
const cs = toCharSet(char, flags)
150+
if (!cs.isDisjointWith(SURROGATES)) {
151+
// If the character (class/set) contains high or low
152+
// surrogates, then we won't be able to guarantee that the
153+
// Unicode pattern will behave the same way.
154+
return false
155+
}
156+
157+
const uCs = toCharSet(char, uFlags)
158+
159+
// Compare the ranges.
160+
return rangeEqual(cs.ranges, uCs.ranges)
161+
}
162+
163+
/**
164+
* Whether the given quantifier accepts the same characters with and without
165+
* the u flag.
166+
*
167+
* This will return `undefined` if the function cannot decide.
168+
*/
169+
function isCompatibleQuantifier(
170+
q: Quantifier,
171+
flags: ReadonlyFlags,
172+
uFlags: ReadonlyFlags,
173+
): boolean | undefined {
174+
if (!isChar(q.element)) {
175+
return undefined
176+
}
177+
178+
if (isCompatibleCharLike(q.element, flags, uFlags)) {
179+
// trivial
180+
return true
181+
}
182+
183+
// A quantifier `n*` or `n+` is the same with and without the
184+
// u flag if all of the following conditions are true:
185+
//
186+
// 1. The UTF16 characters of the element contain all
187+
// surrogate characters (U+D800-U+DFFF).
188+
// 2. The Unicode characters of the element contain all
189+
// surrogate characters (U+D800-U+DFFF) and astral
190+
// characters (U+10000-U+10FFFF).
191+
// 3. All non-surrogate and non-astral characters of the UTF16
192+
// and Unicode characters of the element are the same.
193+
// 4. The first character before the quantifier is not a
194+
// high surrogate (U+D800-U+DBFF).
195+
// 5. The first character after the quantifier is not a
196+
// low surrogate (U+DC00-U+DFFF).
197+
198+
if (q.min > 1 || q.max !== Infinity) {
199+
return undefined
200+
}
201+
202+
const cs = toCharSet(q.element, flags)
203+
if (!cs.isSupersetOf(SURROGATES)) {
204+
// failed condition 1
205+
return false
206+
}
207+
208+
const uCs = toCharSet(q.element, uFlags)
209+
if (!uCs.isSupersetOf(SURROGATES) || !uCs.isSupersetOf(ASTRAL)) {
210+
// failed condition 2
211+
return false
212+
}
213+
214+
if (!rangeEqual(cs.ranges, uCs.without([ASTRAL]).ranges)) {
215+
// failed condition 3
216+
return false
217+
}
218+
219+
const before = getFirstCharAfter(q, "rtl", flags).char
220+
if (!before.isDisjointWith(HIGH_SURROGATES)) {
221+
// failed condition 4
222+
return false
223+
}
224+
225+
const after = getFirstCharAfter(q, "ltr", flags).char
226+
if (!after.isDisjointWith(LOW_SURROGATES)) {
227+
// failed condition 5
228+
return false
229+
}
230+
231+
return true
232+
}
233+
113234
/**
114235
* Returns whether the regex would keep its behaviour if the u flag were to be
115236
* added.
@@ -133,6 +254,8 @@ function isSemanticallyCompatible(
133254
const flags = regexpContext.flags
134255
const uFlags = toCache({ ...flags, unicode: true })
135256

257+
const skip = new Set<Node>()
258+
136259
return !hasSomeDescendant(
137260
pattern,
138261
(n) => {
@@ -154,28 +277,25 @@ function isSemanticallyCompatible(
154277
return true
155278
}
156279

157-
if (
158-
n.type === "Character" ||
159-
n.type === "CharacterClass" ||
160-
n.type === "CharacterSet"
161-
) {
162-
const cs = toCharSet(n, flags)
163-
if (!cs.isDisjointWith(SURROGATES)) {
164-
// If the character (class/set) contains high or low
165-
// surrogates, then we won't be able to guarantee that the
166-
// Unicode pattern will behave the same way.
167-
return true
168-
}
280+
if (isChar(n)) {
281+
return !isCompatibleCharLike(n, flags, uFlags)
282+
}
283+
284+
if (n.type === "Quantifier") {
285+
const result = isCompatibleQuantifier(n, flags, uFlags)
169286

170-
// Compare the ranges.
171-
return !rangeEqual(cs.ranges, toCharSet(n, uFlags).ranges)
287+
if (result !== undefined) {
288+
skip.add(n)
289+
return !result
290+
}
172291
}
173292

174293
return false
175294
},
176295
(n) => {
177-
// Don't go into character classes, we already checked them
178-
return n.type !== "CharacterClass"
296+
// Don't go into character classes, we already checked them.
297+
// We also don't want to go into elements that we explicitly skipped.
298+
return n.type !== "CharacterClass" && !skip.has(n)
179299
},
180300
)
181301
}

tests/lib/rules/require-unicode-regexp.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,21 @@ tester.run("require-unicode-regexp", rule as any, {
9999
output: String.raw`/foo/iu`,
100100
errors: 1,
101101
},
102+
{
103+
code: String.raw`/ab+c/`,
104+
output: String.raw`/ab+c/u`,
105+
errors: 1,
106+
},
107+
{
108+
code: String.raw`/a.*b/`,
109+
output: String.raw`/a.*b/u`,
110+
errors: 1,
111+
},
112+
{
113+
code: String.raw`/<[^<>]+>/`,
114+
output: String.raw`/<[^<>]+>/u`,
115+
errors: 1,
116+
},
102117
{
103118
// "k" maps to 3 characters in ignore-case Unicode mode
104119
code: String.raw`/k/i`,
@@ -132,5 +147,11 @@ tester.run("require-unicode-regexp", rule as any, {
132147
output: null,
133148
errors: 1,
134149
},
150+
{
151+
// "<😃>" is accepted with the u flag (one code point) but not without it (two UTF-16 code units)
152+
code: String.raw`/<[^<>]>/`,
153+
output: null,
154+
errors: 1,
155+
},
135156
],
136157
})

0 commit comments

Comments
 (0)