Skip to content

Commit 40273b3

Browse files
authored
Improve regexp/sort-alternatives rule to add support for string alternatives and v flag (#587)
1 parent 95643c0 commit 40273b3

File tree

5 files changed

+273
-23
lines changed

5 files changed

+273
-23
lines changed

.changeset/rich-ways-exercise.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"eslint-plugin-regexp": minor
3+
---
4+
5+
Improve `regexp/sort-alternatives` rule to add support for string alternatives and v flag

docs/rules/sort-alternatives.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ This rule will sort alternatives to improve readability and maintainability.
1919

2020
The primary target of this rule are lists of words and/or numbers. These lists are somewhat common, and sorting them makes it easy for readers to check whether a particular word or number is included.
2121

22-
This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.
22+
This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.\
23+
However, character classes containing strings are ensured to match the longest string, so they can always be sorted.
2324

2425
<eslint-code-block fix>
2526

@@ -29,11 +30,13 @@ This rule will only sort alternatives if reordering the alternatives doesn't aff
2930
/* ✓ GOOD */
3031
var foo = /\b(1|2|3)\b/;
3132
var foo = /\b(alpha|beta|gamma)\b/;
33+
var foo = /[\q{blue|green|red}]/v;
3234

3335
/* ✗ BAD */
3436
var foo = /\b(2|1|3)\b/;
3537
var foo = /__(?:Foo|Bar)__/;
3638
var foo = /\((?:TM|R|C)\)/;
39+
var foo = /[\q{red|green|blue}]/v;
3740
```
3841

3942
</eslint-code-block>

lib/rules/sort-alternatives.ts

Lines changed: 123 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ import type {
44
Character,
55
CharacterClass,
66
CharacterSet,
7+
ClassStringDisjunction,
78
Element,
8-
Node,
9+
ExpressionCharacterClass,
910
Pattern,
11+
StringAlternative,
1012
} from "@eslint-community/regexpp/ast"
1113
import type { RegExpContext } from "../utils"
1214
import {
@@ -30,12 +32,14 @@ import {
3032
hasSomeDescendant,
3133
canReorder,
3234
getLongestPrefix,
33-
toCharSet,
3435
getConsumedChars,
36+
toUnicodeSet,
37+
hasStrings,
3538
} from "regexp-ast-analysis"
3639
import type { CharSet, Word, ReadonlyWord } from "refa"
3740
import { NFA, JS, transform } from "refa"
3841
import { getParser } from "../utils/regexp-ast"
42+
import { getLexicographicallySmallestInConcatenation } from "../utils/lexicographically-smallest"
3943

4044
interface AllowedChars {
4145
allowed: CharSet
@@ -45,7 +49,10 @@ const cache = new Map<string, Readonly<AllowedChars>>()
4549

4650
function getAllowedChars(flags: ReadonlyFlags) {
4751
assertValidFlags(flags)
48-
const cacheKey = (flags.ignoreCase ? "i" : "") + (flags.unicode ? "u" : "")
52+
const cacheKey =
53+
(flags.ignoreCase ? "i" : "") +
54+
(flags.unicode ? "u" : "") +
55+
(flags.unicodeSets ? "v" : "")
4956
let result = cache.get(cacheKey)
5057
if (result === undefined) {
5158
result = {
@@ -86,7 +93,8 @@ function containsOnlyLiterals(
8693
d.type === "Backreference" ||
8794
d.type === "CharacterSet" ||
8895
(d.type === "Quantifier" && d.max === Infinity) ||
89-
(d.type === "CharacterClass" && d.negate)
96+
(d.type === "CharacterClass" && d.negate) ||
97+
(d.type === "ExpressionCharacterClass" && d.negate)
9098
)
9199
},
92100
(d) => d.type !== "Assertion",
@@ -156,29 +164,45 @@ function approximateLexicographicallySmallest(
156164
return getLexicographicallySmallestFromCharSets(prefix)
157165
}
158166

167+
function getLexicographicallySmallestFromAlternative(
168+
alternative: Alternative,
169+
parser: JS.Parser,
170+
flags: ReadonlyFlags,
171+
): Word | undefined
172+
function getLexicographicallySmallestFromAlternative(
173+
alternative: StringAlternative,
174+
parser: JS.Parser,
175+
flags: ReadonlyFlags,
176+
): Word
159177
/**
160178
* If defined, this will return the lexicographically smallest string accepted
161179
* by the given alternative (ignoring assertions).
162180
*/
163181
function getLexicographicallySmallestFromAlternative(
164-
alternative: Alternative,
182+
alternative: Alternative | StringAlternative,
165183
parser: JS.Parser,
166184
flags: ReadonlyFlags,
167185
): Word | undefined {
168-
const { elements } = alternative
169-
if (isOnlyCharacters(elements)) {
186+
if (
187+
alternative.type === "StringAlternative" ||
188+
hasOnlyCharacters(alternative, flags)
189+
) {
170190
// fast path to avoid converting simple alternatives into NFAs
171191
const smallest: Word = []
172-
for (const e of elements) {
173-
// FIXME: TS Error
174-
// @ts-expect-error -- FIXME
175-
const cs = toCharSet(e, flags)
192+
for (const e of alternative.elements) {
193+
const cs = toUnicodeSet(e, flags).chars
176194
if (cs.isEmpty) return undefined
177195
smallest.push(cs.ranges[0].min)
178196
}
179197
return smallest
180198
}
181199

200+
if (isOnlyCharacterElements(alternative.elements)) {
201+
return getLexicographicallySmallestInConcatenation(
202+
alternative.elements.map((e) => toUnicodeSet(e, flags)),
203+
)
204+
}
205+
182206
try {
183207
const result = parser.parseElement(alternative, {
184208
assertions: "unknown",
@@ -212,15 +236,45 @@ function getLexicographicallySmallestFromAlternative(
212236
}
213237
}
214238

215-
/** Returns whether the given array of nodes contains only characters. */
216-
function isOnlyCharacters(
217-
nodes: readonly Node[],
218-
): nodes is readonly (Character | CharacterClass | CharacterSet)[] {
239+
/**
240+
* Returns whether the given array of nodes contains only characters.
241+
* But note that if the pattern has the v flag, the character class may contain strings.
242+
*/
243+
function isOnlyCharacterElements(
244+
nodes: Element[],
245+
): nodes is (
246+
| Character
247+
| CharacterClass
248+
| CharacterSet
249+
| ExpressionCharacterClass
250+
)[] {
219251
return nodes.every(
220252
(e) =>
221253
e.type === "Character" ||
222254
e.type === "CharacterClass" ||
223-
e.type === "CharacterSet",
255+
e.type === "CharacterSet" ||
256+
e.type === "ExpressionCharacterClass",
257+
)
258+
}
259+
260+
/**
261+
* Returns whether the given alternative has contains only characters.
262+
* The v flag in the pattern does not contains the string.
263+
*/
264+
function hasOnlyCharacters(
265+
alternative: Alternative,
266+
flags: ReadonlyFlags,
267+
): alternative is Alternative & {
268+
elements: readonly (
269+
| Character
270+
| CharacterClass
271+
| CharacterSet
272+
| ExpressionCharacterClass
273+
)[]
274+
} {
275+
return (
276+
isOnlyCharacterElements(alternative.elements) &&
277+
alternative.elements.every((e) => !hasStrings(e, flags))
224278
)
225279
}
226280

@@ -433,6 +487,28 @@ function sortAlternatives(
433487
})
434488
}
435489

490+
/**
491+
* Sorts the given string alternatives.
492+
*
493+
* Sorting is done by comparing the lexicographically smallest strings (LSS).
494+
*
495+
* For more information on why we use LSS-based comparison and how it works,
496+
* see https://github.com/ota-meshi/eslint-plugin-regexp/pull/423.
497+
*/
498+
function sortStringAlternatives(
499+
alternatives: StringAlternative[],
500+
parser: JS.Parser,
501+
flags: ReadonlyFlags,
502+
): void {
503+
alternatives.sort((a, b) => {
504+
const lssDiff = compareWords(
505+
getLexicographicallySmallestFromAlternative(a, parser, flags),
506+
getLexicographicallySmallestFromAlternative(b, parser, flags),
507+
)
508+
return lssDiff
509+
})
510+
}
511+
436512
/**
437513
* Returns whether the given string is a valid integer.
438514
* @param str
@@ -446,7 +522,9 @@ function isIntegerString(str: string): boolean {
446522
* This tries to sort the given alternatives by assuming that all alternatives
447523
* are a number.
448524
*/
449-
function trySortNumberAlternatives(alternatives: Alternative[]): void {
525+
function trySortNumberAlternatives(
526+
alternatives: (Alternative | StringAlternative)[],
527+
): void {
450528
const runs = getRuns(alternatives, (a) => isIntegerString(a.raw))
451529
for (const { startIndex, elements } of runs) {
452530
elements.sort((a, b) => {
@@ -528,7 +606,7 @@ export default createRule("sort-alternatives", {
528606
fixable: "code",
529607
schema: [],
530608
messages: {
531-
sort: "The alternatives of this group can be sorted without affecting the regex.",
609+
sort: "The {{alternatives}} can be sorted without affecting the regex.",
532610
},
533611
type: "suggestion", // "problem",
534612
},
@@ -551,7 +629,6 @@ export default createRule("sort-alternatives", {
551629
let chars = possibleCharsCache.get(a)
552630
if (chars === undefined) {
553631
chars = getConsumedChars(a, flags).chars
554-
possibleCharsCache.set(a, chars)
555632
}
556633
return chars
557634
}
@@ -590,14 +667,19 @@ export default createRule("sort-alternatives", {
590667
}
591668
}
592669

593-
enforceSorted(run)
670+
enforceSorted(run, "alternatives of this group")
594671
}
595672

596673
/**
597674
* Creates a report if the sorted alternatives are different from
598675
* the unsorted ones.
599676
*/
600-
function enforceSorted(run: Run<Alternative>): void {
677+
function enforceSorted(
678+
run: Run<Alternative | StringAlternative>,
679+
alternatives:
680+
| "alternatives of this group"
681+
| "string alternatives",
682+
): void {
601683
const sorted = run.elements
602684
const parent = sorted[0].parent
603685
const unsorted = parent.alternatives.slice(
@@ -619,6 +701,7 @@ export default createRule("sort-alternatives", {
619701
node,
620702
loc,
621703
messageId: "sort",
704+
data: { alternatives },
622705
fix: fixReplaceNode(parent, () => {
623706
const prefix = parent.raw.slice(
624707
0,
@@ -682,10 +765,29 @@ export default createRule("sort-alternatives", {
682765
}
683766
}
684767

768+
/** The handler for ClassStringDisjunction */
769+
function onClassStringDisjunction(
770+
parent: ClassStringDisjunction,
771+
): void {
772+
if (parent.alternatives.length < 2) {
773+
return
774+
}
775+
776+
const alternatives = [...parent.alternatives]
777+
sortStringAlternatives(alternatives, parser, flags)
778+
trySortNumberAlternatives(alternatives)
779+
const run: Run<StringAlternative> = {
780+
startIndex: 0,
781+
elements: [...alternatives],
782+
}
783+
enforceSorted(run, "string alternatives")
784+
}
785+
685786
return {
686787
onGroupEnter: onParent,
687788
onPatternEnter: onParent,
688789
onCapturingGroupEnter: onParent,
790+
onClassStringDisjunctionEnter: onClassStringDisjunction,
689791
}
690792
}
691793

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import type { Word } from "refa"
2+
import type { JS } from "refa"
3+
4+
function findMin<T>(
5+
array: readonly T[],
6+
compare: (a: T, b: T) => number,
7+
): T | undefined {
8+
if (array.length === 0) {
9+
return undefined
10+
}
11+
12+
let min = array[0]
13+
for (let i = 1; i < array.length; i++) {
14+
const item = array[i]
15+
if (compare(item, min) < 0) {
16+
min = item
17+
}
18+
}
19+
return min
20+
}
21+
22+
function compareWords(a: Word, b: Word): number {
23+
const l = Math.min(a.length, b.length)
24+
for (let i = 0; i < l; i++) {
25+
const diff = a[i] - b[i]
26+
if (diff !== 0) {
27+
return diff
28+
}
29+
}
30+
return a.length - b.length
31+
}
32+
33+
/**
34+
* Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
35+
*/
36+
export function getLexicographicallySmallest(
37+
set: JS.UnicodeSet,
38+
): Word | undefined {
39+
if (set.accept.isEmpty) {
40+
return set.chars.isEmpty ? undefined : [set.chars.ranges[0].min]
41+
}
42+
43+
const words = set.accept.wordSets.map(
44+
(w): Word => w.map((c) => c.ranges[0].min),
45+
)
46+
return findMin(words, compareWords)
47+
}
48+
49+
/**
50+
* Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
51+
*/
52+
export function getLexicographicallySmallestInConcatenation(
53+
elements: readonly JS.UnicodeSet[],
54+
): Word | undefined {
55+
if (elements.length === 1) {
56+
return getLexicographicallySmallest(elements[0])
57+
}
58+
59+
let smallest: Word = []
60+
for (let i = elements.length - 1; i >= 0; i--) {
61+
const set = elements[i]
62+
if (set.isEmpty) {
63+
return undefined
64+
} else if (set.accept.isEmpty) {
65+
smallest.unshift(set.chars.ranges[0].min)
66+
} else {
67+
let words = [
68+
...(set.chars.isEmpty ? [] : [[set.chars]]),
69+
...set.accept.wordSets,
70+
].map((w): Word => w.map((c) => c.ranges[0].min))
71+
// we only have to consider the lexicographically smallest words with unique length
72+
const seenLengths = new Set<number>()
73+
words = words.sort(compareWords).filter((w) => {
74+
if (seenLengths.has(w.length)) {
75+
return false
76+
}
77+
seenLengths.add(w.length)
78+
return true
79+
})
80+
81+
smallest = findMin(
82+
// eslint-disable-next-line no-loop-func -- x
83+
words.map((w): Word => [...w, ...smallest]),
84+
compareWords,
85+
)!
86+
}
87+
}
88+
return smallest
89+
}

0 commit comments

Comments
 (0)