1
1
import type { CharRange } from "refa"
2
2
import { visitRegExpAST , RegExpParser } from "regexpp"
3
- import type { Pattern } from "regexpp/ast"
3
+ import type {
4
+ Character ,
5
+ CharacterClass ,
6
+ CharacterSet ,
7
+ Node ,
8
+ Pattern ,
9
+ Quantifier ,
10
+ } from "regexpp/ast"
4
11
import type { RegExpVisitor } from "regexpp/visitor"
5
12
import type { RegExpContext } from "../utils"
6
13
import { createRule , defineRegexpVisitor } from "../utils"
7
- import { hasSomeDescendant , toCache , toCharSet } from "regexp-ast-analysis"
14
+ import type { ReadonlyFlags } from "regexp-ast-analysis"
15
+ import {
16
+ hasSomeDescendant ,
17
+ toCache ,
18
+ toCharSet ,
19
+ getFirstCharAfter ,
20
+ } from "regexp-ast-analysis"
8
21
9
22
const UTF16_MAX = 0xffff
10
23
@@ -93,7 +106,10 @@ function isSyntacticallyCompatible(pattern: Pattern): false | Pattern {
93
106
return uPattern
94
107
}
95
108
109
+ // High surrogates, U+D800-U+DBFF.
+ const HIGH_SURROGATES : CharRange = { min : 0xd800 , max : 0xdbff }
110
+ // Low surrogates, U+DC00-U+DFFF.
+ const LOW_SURROGATES : CharRange = { min : 0xdc00 , max : 0xdfff }
96
111
// All surrogates (high and low), U+D800-U+DFFF.
const SURROGATES : CharRange = { min : 0xd800 , max : 0xdfff }
112
+ // Astral characters, U+10000-U+10FFFF.
+ const ASTRAL : CharRange = { min : 0x10000 , max : 0x10ffff }
97
113
98
114
/** Returns whether the two given ranges are equal. */
99
115
function rangeEqual ( a : readonly CharRange [ ] , b : readonly CharRange [ ] ) : boolean {
@@ -110,6 +126,111 @@ function rangeEqual(a: readonly CharRange[], b: readonly CharRange[]): boolean {
110
126
return true
111
127
}
112
128
129
+ type CharLike = Character | CharacterClass | CharacterSet
130
+
131
+ /**
+ * Whether the given node is a character-like element, i.e. its `type` is
+ * `"Character"`, `"CharacterClass"`, or `"CharacterSet"`. Acts as a type
+ * guard narrowing the node to `CharLike`.
+ */
132
+ function isChar ( node : Node ) : node is CharLike {
133
+ return (
134
+ node . type === "Character" ||
135
+ node . type === "CharacterClass" ||
136
+ node . type === "CharacterSet"
137
+ )
138
+ }
139
+
140
+ /**
141
+ * Whether the given char-like accepts the same characters with and without
142
+ * the u flag.
+ *
+ * Returns `false` as soon as the character set computed with `flags`
+ * overlaps the surrogate range (U+D800-U+DFFF); otherwise returns whether
+ * the character-set ranges computed with `flags` and with `uFlags` are equal.
+ *
+ * @param char the character, character class, or character set to check.
+ * @param flags the flags used to compute the first (non-u) character set.
+ * @param uFlags the flags used to compute the second character set; the
+ * caller in this file passes `flags` with `unicode: true`.
143
+ */
144
+ function isCompatibleCharLike (
145
+ char : CharLike ,
146
+ flags : ReadonlyFlags ,
147
+ uFlags : ReadonlyFlags ,
148
+ ) : boolean {
149
+ const cs = toCharSet ( char , flags )
150
+ if ( ! cs . isDisjointWith ( SURROGATES ) ) {
151
+ // If the character (class/set) contains high or low
152
+ // surrogates, then we won't be able to guarantee that the
153
+ // Unicode pattern will behave the same way.
154
+ return false
155
+ }
156
+
157
+ const uCs = toCharSet ( char , uFlags )
158
+
159
+ // No surrogates involved: compatible exactly when both flag sets
+ // produce the same ranges.
160
+ return rangeEqual ( cs . ranges , uCs . ranges )
161
+ }
162
+
163
+ /**
163
+ * Whether the given quantifier accepts the same characters with and without
164
+ * the u flag.
165
+ *
166
+ * Returns `true` or `false` when the function can decide, and `undefined`
+ * when it cannot: either the quantified element is not a character,
+ * character class, or character set, or the element is not trivially
+ * compatible and the quantifier is not of the form `n*` / `n+`
+ * (a minimum of 0 or 1 and a maximum of Infinity).
+ *
+ * @param q the quantifier to check.
+ * @param flags the flags used for the non-u analysis.
+ * @param uFlags the flags used for the u analysis; the caller in this file
+ * passes `flags` with `unicode: true`.
167
+ */
168
+ function isCompatibleQuantifier (
169
+ q : Quantifier ,
170
+ flags : ReadonlyFlags ,
171
+ uFlags : ReadonlyFlags ,
172
+ ) : boolean | undefined {
173
+ if ( ! isChar ( q . element ) ) {
174
+ return undefined
175
+ }
176
+
177
+ if ( isCompatibleCharLike ( q . element , flags , uFlags ) ) {
178
+ // Trivial: the element itself is compatible, so quantifying it is too.
179
+ return true
180
+ }
181
+
182
+ // A quantifier `n*` or `n+` is the same with and without the
183
+ // u flag if all of the following conditions are true:
184
+ //
185
+ // 1. The UTF16 characters of the element contain all
186
+ // surrogate characters (U+D800-U+DFFF).
187
+ // 2. The Unicode characters of the element contain all
188
+ // surrogate characters (U+D800-U+DFFF) and astral
189
+ // characters (U+10000-U+10FFFF).
190
+ // 3. All non-astral characters of the UTF16 and Unicode
191
+ // characters of the element are the same.
192
+ // 4. The first character before the quantifier is not a
193
+ // high surrogate (U+D800-U+DBFF).
194
+ // 5. The first character after the quantifier is not a
195
+ // low surrogate (U+DC00-U+DFFF).
196
+
197
+ // Only `n*` and `n+` (minimum of 0 or 1, no upper bound) are analysed;
+ // any other quantifier is left undecided.
+ if ( q . min > 1 || q . max !== Infinity ) {
198
+ return undefined
199
+ }
200
+
201
+ const cs = toCharSet ( q . element , flags )
202
+ if ( ! cs . isSupersetOf ( SURROGATES ) ) {
203
+ // failed condition 1
204
+ return false
205
+ }
206
+
207
+ const uCs = toCharSet ( q . element , uFlags )
208
+ if ( ! uCs . isSupersetOf ( SURROGATES ) || ! uCs . isSupersetOf ( ASTRAL ) ) {
209
+ // failed condition 2
210
+ return false
211
+ }
212
+
213
+ if ( ! rangeEqual ( cs . ranges , uCs . without ( [ ASTRAL ] ) . ranges ) ) {
214
+ // failed condition 3
215
+ return false
216
+ }
217
+
218
+ // The "rtl" direction is used to obtain the character before the quantifier.
+ const before = getFirstCharAfter ( q , "rtl" , flags ) . char
219
+ if ( ! before . isDisjointWith ( HIGH_SURROGATES ) ) {
220
+ // failed condition 4
221
+ return false
222
+ }
223
+
224
+ // The "ltr" direction is used to obtain the character after the quantifier.
+ const after = getFirstCharAfter ( q , "ltr" , flags ) . char
225
+ if ( ! after . isDisjointWith ( LOW_SURROGATES ) ) {
226
+ // failed condition 5
227
+ return false
228
+ }
229
+
230
+ // All five conditions hold.
+ return true
231
+ }
233
+
113
234
/**
114
235
* Returns whether the regex would keep its behaviour if the u flag were to be
115
236
* added.
@@ -133,6 +254,8 @@ function isSemanticallyCompatible(
133
254
const flags = regexpContext . flags
134
255
const uFlags = toCache ( { ...flags , unicode : true } )
135
256
257
+ const skip = new Set < Node > ( )
258
+
136
259
return ! hasSomeDescendant (
137
260
pattern ,
138
261
( n ) => {
@@ -154,28 +277,25 @@ function isSemanticallyCompatible(
154
277
return true
155
278
}
156
279
157
- if (
158
- n . type === "Character" ||
159
- n . type === "CharacterClass" ||
160
- n . type === "CharacterSet"
161
- ) {
162
- const cs = toCharSet ( n , flags )
163
- if ( ! cs . isDisjointWith ( SURROGATES ) ) {
164
- // If the character (class/set) contains high or low
165
- // surrogates, then we won't be able to guarantee that the
166
- // Unicode pattern will behave the same way.
167
- return true
168
- }
280
+ if ( isChar ( n ) ) {
281
+ return ! isCompatibleCharLike ( n , flags , uFlags )
282
+ }
283
+
284
+ if ( n . type === "Quantifier" ) {
285
+ const result = isCompatibleQuantifier ( n , flags , uFlags )
169
286
170
- // Compare the ranges.
171
- return ! rangeEqual ( cs . ranges , toCharSet ( n , uFlags ) . ranges )
287
+ if ( result !== undefined ) {
288
+ skip . add ( n )
289
+ return ! result
290
+ }
172
291
}
173
292
174
293
return false
175
294
} ,
176
295
( n ) => {
177
- // Don't go into character classes, we already checked them
178
- return n . type !== "CharacterClass"
296
+ // Don't go into character classes, we already checked them.
297
+ // We also don't want to go into elements we explicitly skipped.
298
+ return n . type !== "CharacterClass" && ! skip . has ( n )
179
299
} ,
180
300
)
181
301
}
0 commit comments