|
1 | 1 | /** Provides a class hierarchy corresponding to a parse tree of regular expressions. */
|
2 | 2 |
|
3 |
| -private import java |
4 |
| -private import semmle.code.java.regex.regex |
| 3 | +private import semmle.code.java.regex.regex as RE // importing under a namespace to avoid a naming conflict for `Top`. |
| 4 | +private import codeql.regex.nfa.NfaUtils as NfaUtils |
| 5 | +// exporting as RegexTreeView, and in the top-level scope. |
| 6 | +import Impl as RegexTreeView |
5 | 7 | import Impl
|
6 | 8 |
|
7 | 9 | /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
|
8 |
| -RegExpTerm getParsedRegExp(StringLiteral re) { result.getRegex() = re and result.isRootTerm() } |
| 10 | +RegExpTerm getParsedRegExp(RE::StringLiteral re) { result.getRegex() = re and result.isRootTerm() } |
| 11 | + |
| 12 | +private class Regex = RE::Regex; |
| 13 | + |
| 14 | +private class Location = RE::Location; |
| 15 | + |
| 16 | +private class File = RE::File; |
9 | 17 |
|
10 | 18 | /**
|
11 | 19 | * An element containing a regular expression term, that is, either
|
@@ -53,7 +61,10 @@ private newtype TRegExpParent =
|
53 | 61 | /** A back reference */
|
54 | 62 | TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
|
55 | 63 |
|
56 |
| -module Impl { |
| 64 | +private import codeql.regex.RegexTreeView |
| 65 | + |
| 66 | +/** An implementation that satisfies the RegexTreeView signature. */ |
| 67 | +module Impl implements RegexTreeViewSig { |
57 | 68 | /**
|
58 | 69 | * An element containing a regular expression term, that is, either
|
59 | 70 | * a string literal (parsed as a regular expression; the root of the parse tree)
|
@@ -547,6 +558,13 @@ module Impl {
|
547 | 558 | }
|
548 | 559 | }
|
549 | 560 |
|
| 561 | + /** |
| 562 | + * A word boundary, that is, a special-character regular expression term of the form `\b`. |
| 563 | + */ |
| 564 | + class RegExpWordBoundary extends RegExpSpecialChar { |
| 565 | + RegExpWordBoundary() { this.getChar() = "\\b" } |
| 566 | + } |
| 567 | + |
550 | 568 | /**
|
551 | 569 | * Gets the hex number for the `hex` char.
|
552 | 570 | */
|
@@ -1088,4 +1106,69 @@ module Impl {
|
1088 | 1106 |
|
1089 | 1107 | override string getPrimaryQLClass() { result = "RegExpBackRef" }
|
1090 | 1108 | }
|
| 1109 | + |
| 1110 | + class Top = RegExpParent; |
| 1111 | + |
| 1112 | + /** |
| 1113 | + * Holds if `term` is an escape class representing e.g. `\d`, or a named property that has a backslash-escape equivalent. |
| 1114 | + * `clazz` is the character class it represents, e.g. "d" for `\d`. |
| 1115 | + */ |
| 1116 | + predicate isEscapeClass(RegExpTerm term, string clazz) { |
| 1117 | + term.(RegExpCharacterClassEscape).getValue() = clazz |
| 1118 | + or |
| 1119 | + term.(RegExpNamedProperty).getBackslashEquivalent() = clazz |
| 1120 | + } |
| 1121 | + |
| 1122 | + /** |
| 1123 | + * Holds if `term` is a possessive quantifier, e.g. `a*+`, as reported by `term.isPossessive()`. |
| 1124 | + */ |
| 1125 | + predicate isPossessive(RegExpQuantifier term) { term.isPossessive() } |
| 1126 | + |
| 1127 | + /** |
| 1128 | + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against, that is, the regex is not marked as matching the full string. |
| 1129 | + */ |
| 1130 | + predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() } |
| 1131 | + |
| 1132 | + /** |
| 1133 | + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against, that is, the regex is not marked as matching the full string. |
| 1134 | + */ |
| 1135 | + predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() } |
| 1136 | + |
| 1137 | + /** |
| 1138 | + * Holds if the regular expression that `parent` belongs to should not be considered. |
| 1139 | + * |
| 1140 | + * We make the pragmatic performance optimization to ignore regular expressions in files |
| 1141 | + * that do not belong to the project code (such as installed dependencies). Regular expressions whose text contains more than 10 occurrences of `.*` are excluded as well. |
| 1142 | + */ |
| 1143 | + predicate isExcluded(RegExpParent parent) { |
| 1144 | + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) |
| 1145 | + or |
| 1146 | + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so |
| 1147 | + // we explicitly exclude these. |
| 1148 | + strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 |
| 1149 | + } |
| 1150 | + |
| 1151 | + /** |
| 1152 | + * Holds if `root` is a root term whose literal has the `i` flag for case-insensitive matching. |
| 1153 | + */ |
| 1154 | + predicate isIgnoreCase(RegExpTerm root) { |
| 1155 | + root.isRootTerm() and |
| 1156 | + root.getLiteral().isIgnoreCase() |
| 1157 | + } |
| 1158 | + |
| 1159 | + /** |
| 1160 | + * Gets the flags for `root`, or the empty string if `root` has no flags. Only has a result when `root` is a root term. |
| 1161 | + */ |
| 1162 | + deprecated string getFlags(RegExpTerm root) { |
| 1163 | + root.isRootTerm() and |
| 1164 | + result = root.getLiteral().getFlags() |
| 1165 | + } |
| 1166 | + |
| 1167 | + /** |
| 1168 | + * Holds if `root` is a root term whose literal has the `s` flag for dot-all matching, where `.` also matches line terminators. |
| 1169 | + */ |
| 1170 | + predicate isDotAll(RegExpTerm root) { |
| 1171 | + root.isRootTerm() and |
| 1172 | + root.getLiteral().isDotAll() |
| 1173 | + } |
1091 | 1174 | }
|
0 commit comments