Add regexp/strict rule (#220)

ota-meshi · RunDevelopment · web-flow · commit 3b076a755051 · 2021-06-04T12:19:04.000+02:00
Co-authored-by: Michael Schmidt <mitchi5000.ms@googlemail.com>
diff --git a/README.md b/README.md
@@ -112,6 +112,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
 | [regexp/no-useless-assertions](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-assertions.html) | disallow assertions that are known to always accept (or reject) |  |
 | [regexp/no-useless-backreference](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-backreference.html) | disallow useless backreferences in regular expressions | :star: |
 | [regexp/no-useless-dollar-replacements](https://ota-meshi.github.io/eslint-plugin-regexp/rules/no-useless-dollar-replacements.html) | disallow useless `$` replacements in replacement string |  |
+| [regexp/strict](https://ota-meshi.github.io/eslint-plugin-regexp/rules/strict.html) | disallow not strictly valid regular expressions | :wrench: |
 
 ### Best Practices
 
diff --git a/docs/rules/README.md b/docs/rules/README.md
@@ -26,6 +26,7 @@ The rules with the following star :star: are included in the `plugin:regexp/reco
 | [regexp/no-useless-assertions](./no-useless-assertions.md) | disallow assertions that are known to always accept (or reject) |  |
 | [regexp/no-useless-backreference](./no-useless-backreference.md) | disallow useless backreferences in regular expressions | :star: |
 | [regexp/no-useless-dollar-replacements](./no-useless-dollar-replacements.md) | disallow useless `$` replacements in replacement string |  |
+| [regexp/strict](./strict.md) | disallow not strictly valid regular expressions | :wrench: |
 
 ### Best Practices
 
diff --git a/docs/rules/strict.md b/docs/rules/strict.md
@@ -0,0 +1,64 @@
+---
+pageClass: "rule-details"
+sidebarDepth: 0
+title: "regexp/strict"
+description: "disallow not strictly valid regular expressions"
+---
+# regexp/strict
+
+> disallow not strictly valid regular expressions
+
+- :exclamation: <badge text="This rule has not been released yet." vertical="middle" type="error"> ***This rule has not been released yet.*** </badge>
+- :wrench: The `--fix` option on the [command line](https://eslint.org/docs/user-guide/command-line-interface#fixing-problems) can automatically fix some of the problems reported by this rule.
+
+## :book: Rule Details
+
+This rule disallows regular expressions that are not strictly valid.
+
+An invalid pattern in a regular expression literal causes a `SyntaxError` when the code is parsed. However, patterns are not always checked strictly.
+
+Because of the additional syntax defined in [Annex B] of the ECMAScript specification, some ambiguous patterns are still parsed successfully as valid patterns. This rule reports these ambiguous patterns.
+
+[Annex B]: https://tc39.es/ecma262/#sec-regular-expressions-patterns
+
+<eslint-code-block fix>
+
+```js
+/* eslint regexp/strict: "error" */
+
+/* ✓ GOOD */
+var foo = /\}/
+var foo = /\{/
+var foo = /\]/
+var foo = /\u{42}/u; // It matches "B".
+var foo = /u{42}/; // It matches 42 consecutive "u"s.
+
+/* ✗ BAD */
+var foo = /}/
+var foo = /{/
+var foo = /]/
+var foo = /\u{42}/; // Without the u flag, it matches 42 consecutive "u"s.
+```
+
+</eslint-code-block>
+
+## :wrench: Options
+
+Nothing.
+
+## :books: Further reading
+
+- [ECMAScript® 2022 Language Specification > Annex B > B.1.4 Regular Expressions Patterns](https://tc39.es/ecma262/#sec-regular-expressions-patterns)
+
+## :couple: Related rules
+
+- [no-invalid-regexp]
+- [regexp/no-standalone-backslash]
+
+[no-invalid-regexp]: https://eslint.org/docs/rules/no-invalid-regexp
+[regexp/no-standalone-backslash]: ./no-standalone-backslash.md
+
+## :mag: Implementation
+
+- [Rule source](https://github.com/ota-meshi/eslint-plugin-regexp/blob/master/lib/rules/strict.ts)
+- [Test source](https://github.com/ota-meshi/eslint-plugin-regexp/blob/master/tests/lib/rules/strict.ts)
diff --git a/lib/rules/strict.ts b/lib/rules/strict.ts
@@ -0,0 +1,275 @@
+import { RegExpValidator } from "regexpp"
+import type { CharacterClassElement, Element } from "regexpp/ast"
+import type { RegExpVisitor } from "regexpp/visitor"
+import type { RegExpContext } from "../utils"
+import {
+    isOctalEscape,
+    createRule,
+    defineRegexpVisitor,
+    isEscapeSequence,
+} from "../utils"
+
+const validator = new RegExpValidator({ strict: true, ecmaVersion: 2020 })
+
+/**
+ * Check syntax error in a given pattern; non-`Error` throws are stringified so the result is always a string.
+ * @returns The syntax error message, or `null` when the validator accepts the pattern.
+ */
+function validateRegExpPattern(
+    pattern: string,
+    uFlag?: boolean,
+): string | null {
+    try {
+        validator.validatePattern(pattern, undefined, undefined, uFlag)
+        return null
+    } catch (err) {
+        return err instanceof Error ? err.message : String(err)
+    }
+}
+
+const CHARACTER_CLASS_SYNTAX_CHARACTERS = new Set("\\/()[]{}^$.|-+*?".split("")) // identity escapes of these are not reported as "uselessEscape" inside a character class
+const SYNTAX_CHARACTERS = new Set("\\/()[]{}^$.|+*?".split("")) // same list without `-`, used outside a character class
+
+export default createRule("strict", {
+    meta: {
+        docs: {
+            description: "disallow not strictly valid regular expressions",
+            category: "Possible Errors",
+            // TODO Switch to recommended in the major version.
+            // recommended: true,
+            recommended: false,
+        },
+        fixable: "code",
+        schema: [],
+        messages: {
+            // character escape (reported by onCharacterEnter)
+            invalidControlEscape:
+                "Invalid or incomplete control escape sequence. Either use a valid control escape sequence or escaping the standalone backslash.",
+            incompleteEscapeSequence:
+                "Incomplete escape sequence '{{expr}}'. Either use a valid escape sequence or remove the useless escaping.",
+            invalidPropertyEscape:
+                "Invalid property escape sequence '{{expr}}'. Either use a valid property escape sequence or remove the useless escaping.",
+            incompleteBackreference:
+                "Incomplete backreference '{{expr}}'. Either use a valid backreference or remove the useless escaping.",
+            unescapedSourceCharacter: "Unescaped source character '{{expr}}'.",
+            octalEscape:
+                "Invalid legacy octal escape sequence '{{expr}}'. Use a hexadecimal escape instead.",
+            uselessEscape:
+                "Useless identity escapes with non-syntax characters are forbidden.",
+
+            // character class (reported by onCharacterClassEnter)
+            invalidRange:
+                "Invalid character class range. A character set cannot be the minimum or maximum of a character class range. Either escape the `-` or fix the character class range.",
+
+            // assertion (reported by onQuantifierEnter)
+            quantifiedAssertion:
+                "Assertion are not allowed to be quantified directly.",
+
+            // validator (fallback in onPatternLeave; the text is the message of the error thrown by the validator)
+            regexMessage: "{{message}}.",
+        },
+        type: "suggestion",
+    },
+    create(context) {
+        /**
+         * Create the visitor for a single regex; no handlers are returned when the `u` flag is set.
+         */
+        function createVisitor(
+            regexpContext: RegExpContext,
+        ): RegExpVisitor.Handlers {
+            const {
+                node,
+                flags,
+                pattern,
+                getRegexpLocation,
+                fixReplaceNode,
+            } = regexpContext
+
+            if (flags.unicode) {
+                // the Unicode flag enables strict parsing mode automatically
+                return {}
+            }
+
+            let reported = false // set by report(); when true, onPatternLeave skips the validator fallback
+            let hasNamedBackreference = false // set by onBackreferenceEnter; read in onPatternLeave
+
+            /** Report `element` under `messageId` (`expr` is its raw text); a fix is attached only when `fix` is a non-empty string. */
+            function report(
+                messageId: string,
+                element: Element,
+                fix?: string | null,
+            ): void {
+                reported = true
+
+                context.report({
+                    node,
+                    loc: getRegexpLocation(element),
+                    messageId,
+                    data: {
+                        expr: element.raw,
+                    },
+                    fix: fix ? fixReplaceNode(element, fix) : null,
+                })
+            }
+
+            return {
+                // eslint-disable-next-line complexity -- x
+                onCharacterEnter(cNode) {
+                    if (cNode.raw === "\\") {
+                        // a standalone backslash character, e.g. \c5 or \c
+                        report("invalidControlEscape", cNode)
+                        return
+                    }
+                    if (cNode.raw === "\\u" || cNode.raw === "\\x") {
+                        // a bare \u or \x character, e.g. \u000;
+                        report("incompleteEscapeSequence", cNode)
+                        return
+                    }
+                    if (cNode.raw === "\\p" || cNode.raw === "\\P") {
+                        // e.g. \p{H} or \p
+                        report("invalidPropertyEscape", cNode)
+                        return
+                    }
+                    if (cNode.value !== 0 && isOctalEscape(cNode.raw)) {
+                        // e.g. \023 (the fix rewrites it as a \x escape padded to two hex digits)
+                        report(
+                            "octalEscape",
+                            cNode,
+                            `\\x${cNode.value.toString(16).padStart(2, "0")}`,
+                        )
+                        return
+                    }
+
+                    const insideCharClass =
+                        cNode.parent.type === "CharacterClass" ||
+                        cNode.parent.type === "CharacterClassRange"
+
+                    if (!insideCharClass) {
+                        if (cNode.raw === "\\k") {
+                            // e.g. \k<foo or \k
+                            report("incompleteBackreference", cNode)
+                            return
+                        }
+
+                        if (
+                            cNode.raw === "{" ||
+                            cNode.raw === "}" ||
+                            cNode.raw === "]"
+                        ) {
+                            report(
+                                "unescapedSourceCharacter",
+                                cNode,
+                                `\\${cNode.raw}`,
+                            )
+                            return
+                        }
+                    }
+
+                    if (isEscapeSequence(cNode.raw)) {
+                        // all remaining escape sequences are valid
+                        return
+                    }
+
+                    if (cNode.raw.startsWith("\\")) {
+                        const identity = cNode.raw.slice(1)
+                        const syntaxChars = insideCharClass
+                            ? CHARACTER_CLASS_SYNTAX_CHARACTERS
+                            : SYNTAX_CHARACTERS
+
+                        if (
+                            cNode.value === identity.charCodeAt(0) &&
+                            !syntaxChars.has(identity)
+                        ) {
+                            // e.g. \g or \; (the fix drops the backslash)
+                            report("uselessEscape", cNode, identity)
+                        }
+                    }
+                },
+                onCharacterClassEnter(ccNode) {
+                    for (let i = 0; i < ccNode.elements.length; i++) {
+                        const current = ccNode.elements[i]
+
+                        if (current.type === "CharacterSet") {
+                            const next: CharacterClassElement | undefined =
+                                ccNode.elements[i + 1]
+                            const nextNext: CharacterClassElement | undefined =
+                                ccNode.elements[i + 2]
+
+                            if (next && next.raw === "-" && nextNext) {
+                                // e.g. [\w-a]
+                                report("invalidRange", current)
+                                return // at most one invalid range is reported per character class
+                            }
+
+                            const prev: CharacterClassElement | undefined =
+                                ccNode.elements[i - 1]
+                            const prevPrev: CharacterClassElement | undefined =
+                                ccNode.elements[i - 2]
+                            if (
+                                prev &&
+                                prev.raw === "-" &&
+                                prevPrev &&
+                                prevPrev.type !== "CharacterClassRange"
+                            ) {
+                                // e.g. [a-\w]
+                                report("invalidRange", current)
+                                return
+                            }
+                        }
+                    }
+                },
+                onQuantifierEnter(qNode) {
+                    if (qNode.element.type === "Assertion") {
+                        // e.g. \b+ (the fix wraps the assertion in a non-capturing group: (?:\b)+)
+                        report(
+                            "quantifiedAssertion",
+                            qNode,
+                            `(?:${qNode.element.raw})${qNode.raw.slice(
+                                qNode.element.end - qNode.start,
+                            )}`,
+                        )
+                    }
+                },
+
+                onBackreferenceEnter(bNode) {
+                    if (typeof bNode.ref === "string") {
+                        hasNamedBackreference = true
+                    }
+                },
+                onPatternLeave() {
+                    if (hasNamedBackreference) {
+                        // There is a bug in regexpp that causes it throw a
+                        // syntax error for all non-Unicode regexes with named
+                        // backreferences.
+                        // TODO: Remove this workaround when the bug is fixed.
+                        return
+                    }
+
+                    if (!reported) {
+                        // our own logic couldn't find any problems,
+                        // so let's use a real parser to do the job.
+
+                        const message = validateRegExpPattern(
+                            pattern,
+                            flags.unicode, // always falsy here: Unicode regexes returned early in createVisitor
+                        )
+
+                        if (message) {
+                            context.report({
+                                node,
+                                messageId: "regexMessage",
+                                data: {
+                                    message,
+                                },
+                            })
+                        }
+                    }
+                },
+            }
+        }
+
+        return defineRegexpVisitor(context, {
+            createVisitor,
+        })
+    },
+})
diff --git a/lib/utils/rules.ts b/lib/utils/rules.ts
@@ -58,6 +58,7 @@ import preferT from "../rules/prefer-t"
 import preferUnicodeCodepointEscapes from "../rules/prefer-unicode-codepoint-escapes"
 import preferW from "../rules/prefer-w"
 import sortFlags from "../rules/sort-flags"
+import strict from "../rules/strict"
 import unicodeEscape from "../rules/unicode-escape"
 
 export const rules = [
@@ -120,5 +121,6 @@ export const rules = [
     preferUnicodeCodepointEscapes,
     preferW,
     sortFlags,
+    strict,
     unicodeEscape,
 ] as RuleModule[]
diff --git a/tests/lib/rules/strict.ts b/tests/lib/rules/strict.ts