Skip to content

Commit c9854de

Browse files
Improve sort-alternatives' comparison function (#320)
1 parent 3be66be commit c9854de

File tree

4 files changed

+354
-223
lines changed

4 files changed

+354
-223
lines changed

lib/rules/sort-alternatives.ts

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import type { CharSet } from "refa"
2222
import { JS } from "refa"
2323
import { canReorder } from "../utils/reorder-alternatives"
2424
import { getPossiblyConsumedChar } from "../utils/regexp-ast"
25+
import { getLongestPrefix } from "../utils/regexp-ast/alternative-prefix"
2526

2627
interface AllowedChars {
2728
allowed: CharSet
@@ -83,9 +84,61 @@ function containsOnlyLiterals(
8384
* Compare two strings, independent of the current locale, by byte order.
8485
*/
8586
function compareByteOrder(a: string, b: string): number {
    // The relational operators compare strings code unit by code unit, so
    // the result never depends on the locale (unlike `localeCompare`).
    if (a < b) {
        return -1
    }
    if (a > b) {
        return 1
    }
    // neither smaller nor greater: the strings are identical
    return 0
}
92+
93+
/**
 * Compare two char sets by byte order.
 *
 * The ordering is decided by the first rule that applies:
 * 1. An empty char set is greater than every non-empty char set; two empty
 *    char sets are equal.
 * 2. The char set with the smaller minimum character is smaller.
 * 3. Let x be the smallest character in the symmetric difference of a and b.
 *    If x is in a, then a < b. Otherwise b < a. If the symmetric difference
 *    is empty, then a == b.
 *
 * @param a the first char set
 * @param b the second char set
 * @returns a negative number if a < b, 0 if a == b, and a positive number
 * if a > b
 */
function compareCharSets(a: CharSet, b: CharSet): number {
    // empty char set > everything else
    if (a.isEmpty || b.isEmpty) {
        if (a.isEmpty && b.isEmpty) {
            // Two empty sets are equal. A comparator has to return 0 for
            // equal inputs, otherwise sorting with it is inconsistent.
            return 0
        }
        return a.isEmpty ? 1 : -1
    }

    // the first character is different
    const minA = a.ranges[0].min
    const minB = b.ranges[0].min
    if (minA !== minB) {
        return minA - minB
    }

    // Both sets start with the same character, so the decision is made by
    // the smallest character that is in exactly one of the two sets.
    const symDiff = a.union(b).without(a.intersect(b))
    if (symDiff.isEmpty) {
        // a == b
        return 0
    }

    const min = symDiff.ranges[0].min

    // `min` is in exactly one of the sets: its owner is the smaller set
    return a.has(min) ? -1 : 1
}
131+
132+
/**
133+
* Compare two strings of char sets by byte order.
134+
*/
135+
function compareCharSetStrings(
136+
a: readonly CharSet[],
137+
b: readonly CharSet[],
138+
): number {
86139
const l = Math.min(a.length, b.length)
87140
for (let i = 0; i < l; i++) {
88-
const diff = a.charCodeAt(i) - b.charCodeAt(i)
141+
const diff = compareCharSets(a[i], b[i])
89142
if (diff !== 0) {
90143
return diff
91144
}
@@ -112,10 +165,12 @@ function sortAlternatives(
112165
}
113166

114167
alternatives.sort((a, b) => {
115-
const firstA = firstChars.get(a)!
116-
const firstB = firstChars.get(b)!
117-
if (firstA !== firstB) {
118-
return firstA - firstB
168+
const prefixDiff = compareCharSetStrings(
169+
getLongestPrefix(a, "ltr", context.flags),
170+
getLongestPrefix(b, "ltr", context.flags),
171+
)
172+
if (prefixDiff !== 0) {
173+
return prefixDiff
119174
}
120175

121176
if (context.flags.ignoreCase) {
lib/utils/regexp-ast/alternative-prefix.ts

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
import type { CharSet } from "refa"
2+
import type {
3+
FirstConsumedChar,
4+
MatchingDirection,
5+
ReadonlyFlags,
6+
} from "regexp-ast-analysis"
7+
import {
8+
getFirstCharAfter,
9+
// eslint-disable-next-line no-restricted-imports -- x
10+
toCharSet,
11+
getFirstConsumedChar,
12+
getFirstConsumedCharAfter,
13+
FirstConsumedChars,
14+
isZeroLength,
15+
isPotentiallyZeroLength,
16+
isStrictBackreference,
17+
} from "regexp-ast-analysis"
18+
import type {
19+
Alternative,
20+
CapturingGroup,
21+
Element,
22+
Group,
23+
Quantifier,
24+
} from "regexpp/ast"
25+
26+
// Memoized results of `getLongestPrefix`, one cache per matching direction.
// Entries are keyed by the alternative node only; `flags` is not part of
// the key.
const ltrCache = new WeakMap<Alternative, readonly CharSet[]>()
const rtlCache = new WeakMap<Alternative, readonly CharSet[]>()

/**
 * Returns the longest knowable prefix of characters accepted by the given
 * alternative and after it.
 *
 * The returned prefix may contain the first character after the given
 * alternative.
 *
 * All returned character sets are guaranteed to be non-empty.
 *
 * The result is cached per alternative and direction, so repeated calls for
 * the same alternative return the same array.
 *
 * @param alternative the alternative to compute the prefix of
 * @param direction the matching direction in which the prefix is read
 * @param flags the flags of the regular expression
 * @returns the prefix as a sequence of character sets
 */
export function getLongestPrefix(
    alternative: Alternative,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): readonly CharSet[] {
    const cache = direction === "ltr" ? ltrCache : rtlCache

    const hit = cache.get(alternative)
    if (hit !== undefined) {
        return hit
    }

    const computed = getLongestPrefixUncached(alternative, direction, flags)
    cache.set(alternative, computed)
    return computed
}
51+
52+
/**
 * Uncached version of {@link getLongestPrefix}.
 *
 * Computes the prefix of the alternative, appends the first character after
 * the alternative if the prefix is complete, and cuts the result off at the
 * first empty character set.
 */
function getLongestPrefixUncached(
    alternative: Alternative,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): readonly CharSet[] {
    const prefix = getAlternativePrefix(alternative, direction, flags)
    const { chars } = prefix

    if (prefix.complete) {
        // the whole alternative is known, so we can look one char past it
        chars.push(getFirstCharAfter(alternative, direction, flags).char)
    }

    // drop the first empty char set and everything after it
    const firstEmpty = chars.findIndex((cs) => cs.isEmpty)
    return firstEmpty === -1 ? chars : chars.slice(0, firstEmpty)
}
75+
76+
/** A sequence of character sets accepted at the start of some regex node. */
interface Prefix {
    /** The character sets of the prefix, in matching order. */
    chars: CharSet[]
    /**
     * Whether `chars` describes the whole node. If `false`, the prefix
     * stopped before the end of the node.
     */
    complete: boolean
}
80+
81+
/**
 * Returns the prefix of the given alternative.
 *
 * The prefixes of the alternative's elements are concatenated in matching
 * order until an element with an incomplete prefix is reached.
 */
function getAlternativePrefix(
    alternative: Alternative,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): Prefix {
    // visit the elements in the order in which they are matched
    const ordered =
        direction === "ltr"
            ? alternative.elements
            : [...alternative.elements].reverse()

    const chars: CharSet[] = []
    for (const element of ordered) {
        const elementPrefix = getElementPrefix(element, direction, flags)
        chars.push(...elementPrefix.chars)

        if (!elementPrefix.complete) {
            // nothing is known past this element
            return { chars, complete: false }
        }
    }

    return { chars, complete: true }
}
104+
105+
/**
 * Returns the prefix of the given element.
 *
 * Dispatches on the element type. Throws for an element type that is not
 * handled here.
 */
function getElementPrefix(
    element: Element,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): Prefix {
    if (element.type === "Assertion") {
        // assertions contribute no characters to the prefix
        return { chars: [], complete: true }
    }

    if (
        element.type === "Character" ||
        element.type === "CharacterClass" ||
        element.type === "CharacterSet"
    ) {
        // exactly one character
        return { chars: [toCharSet(element, flags)], complete: true }
    }

    if (element.type === "CapturingGroup" || element.type === "Group") {
        return getGroupPrefix(element, direction, flags)
    }

    if (element.type === "Quantifier") {
        return getQuantifierPrefix(element, direction, flags)
    }

    if (element.type === "Backreference") {
        if (isStrictBackreference(element)) {
            // use the prefix of the referenced group if it is fully known
            const resolved = getElementPrefix(
                element.resolved,
                direction,
                flags,
            )
            if (resolved.complete) {
                return resolved
            }
        }

        // fall back to a single look-ahead character
        const { char } = FirstConsumedChars.toLook(
            getFirstConsumedCharPlusAfter(element, direction, flags),
        )
        return { chars: [char], complete: false }
    }

    throw new Error("unreachable")
}
152+
153+
/**
 * Returns the prefix of the given group.
 *
 * For a group with several alternatives, the prefixes of all alternatives
 * are merged position by position: the char set at position `i` is the union
 * of the char sets at position `i` of all alternatives.
 */
function getGroupPrefix(
    element: Group | CapturingGroup,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): Prefix {
    if (element.alternatives.length === 1) {
        return getAlternativePrefix(element.alternatives[0], direction, flags)
    }

    const prefixes = element.alternatives.map((alternative) =>
        getAlternativePrefix(alternative, direction, flags),
    )

    const chars: CharSet[] = []
    for (let pos = 0; ; pos++) {
        // the char sets of all alternatives that still have one at `pos`
        const column: CharSet[] = []
        let someEnded = false
        let incompleteTail = false

        for (const prefix of prefixes) {
            if (pos >= prefix.chars.length) {
                someEnded = true
                continue
            }

            column.push(prefix.chars[pos])
            if (!prefix.complete && pos === prefix.chars.length - 1) {
                // last known char of an incomplete alternative
                incompleteTail = true
            }
        }

        if (column.length === 0) {
            // This means that all alternatives are complete and have the same
            // length, so we can stop here.
            return { chars, complete: true }
        }

        if (someEnded) {
            // This means that some (but not all) alternatives have reached
            // their end, so we have to consider the chars after the group.
            column.push(getFirstCharAfter(element, direction, flags).char)
        }

        const [head, ...rest] = column
        chars.push(head.union(...rest))

        if (someEnded || incompleteTail) {
            // nothing is known past this position
            return { chars, complete: false }
        }
    }
}
202+
203+
/**
 * Returns the prefix of the given quantifier.
 *
 * The prefix of the quantified element is repeated `min` times. The result
 * is only complete if the quantifier has a fixed number of repetitions.
 */
function getQuantifierPrefix(
    element: Quantifier,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): Prefix {
    if (isZeroLength(element)) {
        return { chars: [], complete: true }
    }

    if (isPotentiallyZeroLength(element)) {
        // only a single look-ahead character is computed in this case
        const { char } = FirstConsumedChars.toLook(
            getFirstConsumedCharPlusAfter(element, direction, flags),
        )
        return { chars: [char], complete: false }
    }

    const single = getElementPrefix(element.element, direction, flags)
    if (!single.complete) {
        return single
    }

    const chars: CharSet[] = []
    for (let repetition = 0; repetition < element.min; repetition++) {
        chars.push(...single.chars)
        if (chars.length > 100) {
            // this is a safe-guard to protect against regexes like a{100000}
            return { chars, complete: false }
        }
    }

    if (element.min !== element.max) {
        // variable number of repetitions: add one look-ahead character
        // after the quantified element and stop there
        const { char } = FirstConsumedChars.toLook(
            getFirstConsumedCharAfter(element.element, direction, flags),
        )
        chars.push(char)
        return { chars, complete: false }
    }

    return { chars, complete: true }
}
243+
244+
/**
 * This operation is equal to:
 *
 * ```
 * concat(
 *     getFirstConsumedChar(element, direction, flags),
 *     getFirstConsumedCharAfter(element, direction, flags),
 * )
 * ```
 *
 * The first char after the element is only computed if the first consumed
 * char of the element is marked as `empty`.
 */
function getFirstConsumedCharPlusAfter(
    element: Element | Alternative,
    direction: MatchingDirection,
    flags: ReadonlyFlags,
): FirstConsumedChar {
    const consumed = getFirstConsumedChar(element, direction, flags)

    if (consumed.empty) {
        const after = getFirstConsumedCharAfter(element, direction, flags)
        return FirstConsumedChars.concat([consumed, after], flags)
    }

    return consumed
}

0 commit comments

Comments
 (0)