Skip to content

Commit 192455b

Browse files
Add regexp/require-unicode-regexp rule (#331)
1 parent 4960281 commit 192455b

File tree

9 files changed

+485
-14
lines changed

9 files changed

+485
-14
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
155155
| [regexp/prefer-range](https://ota-meshi.github.io/eslint-plugin-regexp/rules/prefer-range.html) | enforce using character class range | :star::wrench: |
156156
| [regexp/prefer-regexp-exec](https://ota-meshi.github.io/eslint-plugin-regexp/rules/prefer-regexp-exec.html) | enforce that `RegExp#exec` is used instead of `String#match` if no global flag is provided | |
157157
| [regexp/prefer-regexp-test](https://ota-meshi.github.io/eslint-plugin-regexp/rules/prefer-regexp-test.html) | enforce that `RegExp#test` is used instead of `String#match` and `RegExp#exec` | :wrench: |
158+
| [regexp/require-unicode-regexp](https://ota-meshi.github.io/eslint-plugin-regexp/rules/require-unicode-regexp.html) | enforce the use of the `u` flag | :wrench: |
158159
| [regexp/sort-alternatives](https://ota-meshi.github.io/eslint-plugin-regexp/rules/sort-alternatives.html) | sort alternatives if order doesn't matter | :wrench: |
159160

160161
### Stylistic Issues

docs/rules/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
6464
| [regexp/prefer-range](./prefer-range.md) | enforce using character class range | :star::wrench: |
6565
| [regexp/prefer-regexp-exec](./prefer-regexp-exec.md) | enforce that `RegExp#exec` is used instead of `String#match` if no global flag is provided | |
6666
| [regexp/prefer-regexp-test](./prefer-regexp-test.md) | enforce that `RegExp#test` is used instead of `String#match` and `RegExp#exec` | :wrench: |
67+
| [regexp/require-unicode-regexp](./require-unicode-regexp.md) | enforce the use of the `u` flag | :wrench: |
6768
| [regexp/sort-alternatives](./sort-alternatives.md) | sort alternatives if order doesn't matter | :wrench: |
6869

6970
### Stylistic Issues

docs/rules/require-unicode-regexp.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
---
2+
pageClass: "rule-details"
3+
sidebarDepth: 0
4+
title: "regexp/require-unicode-regexp"
5+
description: "enforce the use of the `u` flag"
6+
---
7+
# regexp/require-unicode-regexp
8+
9+
> enforce the use of the `u` flag
10+
11+
- :exclamation: <badge text="This rule has not been released yet." vertical="middle" type="error"> ***This rule has not been released yet.*** </badge>
12+
- :wrench: The `--fix` option on the [command line](https://eslint.org/docs/user-guide/command-line-interface#fixing-problems) can automatically fix some of the problems reported by this rule.
13+
14+
## :book: Rule Details
15+
16+
This rule reports regular expressions without the `u` flag.
17+
18+
It will automatically add the `u` flag to regular expressions where it is statically guaranteed to be safe to do so. In all other cases, the developer has to check that adding the `u` flag doesn't cause the regex to behave incorrectly.
19+
20+
This rule is inspired by the [require-unicode-regexp] rule. The position of the report is improved over the core rule and arguments of `new RegExp()` are also checked.
21+
22+
<eslint-code-block fix>
23+
24+
```js
25+
/* eslint regexp/require-unicode-regexp: "error" */
26+
27+
/* ✓ GOOD */
28+
var foo = /foo/u;
29+
var foo = /a\s+b/u;
30+
31+
/* ✗ BAD */
32+
var foo = /foo/;
33+
var foo = RegExp("a\\s+b");
34+
var foo = /[a-z]/i;
35+
var foo = /\S/;
36+
```
37+
38+
</eslint-code-block>
39+
40+
## :wrench: Options
41+
42+
Nothing.
43+
44+
## :books: Further reading
45+
46+
- [require-unicode-regexp]
47+
48+
[require-unicode-regexp]: https://eslint.org/docs/rules/require-unicode-regexp
49+
50+
## :mag: Implementation
51+
52+
- [Rule source](https://github.com/ota-meshi/eslint-plugin-regexp/blob/master/lib/rules/require-unicode-regexp.ts)
53+
- [Test source](https://github.com/ota-meshi/eslint-plugin-regexp/blob/master/tests/lib/rules/require-unicode-regexp.ts)

lib/rules/require-unicode-regexp.ts

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
import type { CharRange } from "refa"
2+
import { visitRegExpAST, RegExpParser } from "regexpp"
3+
import type { Pattern } from "regexpp/ast"
4+
import type { RegExpVisitor } from "regexpp/visitor"
5+
import type { RegExpContext } from "../utils"
6+
import { createRule, defineRegexpVisitor } from "../utils"
7+
import { hasSomeDescendant, toCache, toCharSet } from "regexp-ast-analysis"
8+
9+
const UTF16_MAX = 0xffff // largest value of a single UTF-16 code unit; characters above it need a surrogate pair
10+
11+
/**
12+
* Returns whether the given pattern is compatible with unicode-mode on a
13+
* syntactical level. This means that:
14+
*
15+
* 1. The raw regex is syntactically valid with the u flag.
16+
* 2. The regex is parsed the same way (*).
17+
*
18+
* (*) Unicode mode parses surrogate pairs as one character while non-Unicode mode
19+
* parses the pair as two separate code points. We will ignore this difference.
20+
* We will also ignore the semantic differences between escape sequences and
21+
* so on.
22+
*
23+
* @returns `false` or the parsed Unicode pattern
24+
*/
25+
function isSyntacticallyCompatible(pattern: Pattern): false | Pattern {
26+
const INCOMPATIBLE = {} // unique sentinel thrown to abort the AST walks below
27+
28+
// See whether it's syntactically valid
29+
30+
let uPattern
31+
try {
32+
uPattern = new RegExpParser().parsePattern(
33+
pattern.raw,
34+
undefined,
35+
undefined,
36+
true, // parse in Unicode (u flag) mode
37+
)
38+
} catch (error) {
39+
return false
40+
}
41+
42+
// See whether it's parsed the same way
43+
44+
// We will try to find constructs in the non-Unicode regex that we know
45+
// will either result in a syntax error or a different construct. Since
46+
// we already checked for syntax errors, we know that it's the second
47+
// option.
48+
49+
// There is another construct that gets interpreted differently: Surrogates.
50+
// We want to make sure that no surrogate is a quantified element or
51+
// character class element.
52+
53+
try {
54+
visitRegExpAST(pattern, {
55+
onCharacterEnter(node) {
56+
if (/^\\(?![bfnrtv])[A-Za-z]$/.test(node.raw)) {
57+
// All cool Unicode features are behind escapes like \p.
58+
throw INCOMPATIBLE
59+
}
60+
},
61+
})
62+
63+
// See no-misleading-character-class for more details
64+
visitRegExpAST(uPattern, {
65+
onCharacterEnter(node) {
66+
if (
67+
node.value > UTF16_MAX &&
68+
(node.parent.type === "CharacterClass" ||
69+
node.parent.type === "CharacterClassRange")
70+
) {
71+
// /[😃]/ != /[😃]/u
72+
throw INCOMPATIBLE
73+
}
74+
},
75+
onQuantifierEnter(node) {
76+
if (
77+
node.element.type === "Character" &&
78+
node.element.value > UTF16_MAX
79+
) {
80+
// /😃+/ != /😃+/u
81+
throw INCOMPATIBLE
82+
}
83+
},
84+
})
85+
} catch (error) {
86+
if (error === INCOMPATIBLE) {
87+
return false
88+
}
89+
// Anything other than our sentinel is a genuine error: just rethrow
90+
throw error
91+
}
92+
93+
return uPattern
94+
}
95+
96+
const SURROGATES: CharRange = { min: 0xd800, max: 0xdfff } // all UTF-16 surrogates, high and low
97+
98+
/** Returns whether the two given range lists are equal: same length and identical `min`/`max` at every index. */
99+
function rangeEqual(a: readonly CharRange[], b: readonly CharRange[]): boolean {
100+
if (a.length !== b.length) {
101+
return false
102+
}
103+
for (let i = 0; i < a.length; i++) {
104+
const x = a[i]
105+
const y = b[i]
106+
if (x.min !== y.min || x.max !== y.max) {
107+
return false
108+
}
109+
}
110+
return true
111+
}
112+
113+
/**
114+
* Returns whether the regex would keep its behaviour if the u flag were to be
115+
* added, given the same pattern as already parsed in Unicode mode (`uPattern`).
116+
*/
117+
function isSemanticallyCompatible(
118+
regexpContext: RegExpContext,
119+
uPattern: Pattern,
120+
): boolean {
121+
const surrogatePositions = new Set<number>() // offsets covered by characters above UTF16_MAX in uPattern
122+
visitRegExpAST(uPattern, {
123+
onCharacterEnter(node) {
124+
if (node.value > UTF16_MAX) {
125+
for (let i = node.start; i < node.end; i++) {
126+
surrogatePositions.add(i)
127+
}
128+
}
129+
},
130+
})
131+
132+
const pattern = regexpContext.patternAst
133+
const flags = regexpContext.flags
134+
const uFlags = toCache({ ...flags, unicode: true })
135+
136+
return !hasSomeDescendant(
137+
pattern,
138+
(n) => {
139+
// The goal is to find something that will change when adding the
140+
// Unicode flag.
141+
142+
// Surrogates that form a single character in uPattern don't change
143+
if (n.type === "Character" && surrogatePositions.has(n.start)) {
144+
return false
145+
}
146+
147+
if (
148+
n.type === "Assertion" &&
149+
n.kind === "word" &&
150+
flags.ignoreCase
151+
) {
152+
// The case canonicalization in Unicode mode is different which
153+
// causes `\b` and `\B` to accept/reject a few more characters.
154+
return true
155+
}
156+
157+
if (
158+
n.type === "Character" ||
159+
n.type === "CharacterClass" ||
160+
n.type === "CharacterSet"
161+
) {
162+
const cs = toCharSet(n, flags)
163+
if (!cs.isDisjointWith(SURROGATES)) {
164+
// If the character (class/set) contains high or low
165+
// surrogates, then we won't be able to guarantee that the
166+
// Unicode pattern will behave the same way.
167+
return true
168+
}
169+
170+
// Compare the ranges matched with the current flags and with the u flag added.
171+
return !rangeEqual(cs.ranges, toCharSet(n, uFlags).ranges)
172+
}
173+
174+
return false
175+
},
176+
(n) => {
177+
// Don't go into character classes, we already checked them
178+
return n.type !== "CharacterClass"
179+
},
180+
)
181+
}
182+
183+
/**
184+
* Returns whether the regex would keep its behaviour if the u flag were to be
185+
* added, i.e. it passes both the syntactic and the semantic compatibility check.
186+
*/
187+
function isCompatible(regexpContext: RegExpContext): boolean {
188+
const uPattern = isSyntacticallyCompatible(regexpContext.patternAst)
189+
if (!uPattern) {
190+
return false
191+
}
192+
193+
return isSemanticallyCompatible(regexpContext, uPattern)
194+
}
195+
196+
export default createRule("require-unicode-regexp", {
197+
meta: {
198+
docs: {
199+
description: "enforce the use of the `u` flag",
200+
category: "Best Practices",
201+
recommended: false,
202+
},
203+
schema: [],
204+
fixable: "code",
205+
messages: {
206+
require: "Use the 'u' flag.",
207+
},
208+
type: "suggestion", // "problem",
209+
},
210+
create(context) {
211+
/**
212+
* Reports the regex when it lacks the `u` flag; no node handlers are needed, so an empty visitor is returned.
213+
*/
214+
function createVisitor(
215+
regexpContext: RegExpContext,
216+
): RegExpVisitor.Handlers {
217+
const {
218+
node,
219+
flags,
220+
flagsString,
221+
getFlagsLocation,
222+
fixReplaceFlags,
223+
} = regexpContext
224+
225+
if (flagsString === null) {
226+
// This means that there are flags (probably) but we were
227+
// unable to evaluate them.
228+
return {}
229+
}
230+
231+
if (!flags.unicode) {
232+
context.report({
233+
node,
234+
loc: getFlagsLocation(),
235+
messageId: "require",
236+
fix: fixReplaceFlags(() => {
237+
if (!isCompatible(regexpContext)) {
238+
return null // adding `u` is not provably safe; presumably null means "no fix" (confirm in fixReplaceFlags)
239+
}
240+
return `${flagsString}u`
241+
}),
242+
})
243+
}
244+
245+
return {} // the check above is done once per regex, so no AST node handlers are registered
246+
}
247+
248+
return defineRegexpVisitor(context, {
249+
createVisitor,
250+
})
251+
},
252+
})

0 commit comments

Comments
 (0)