|
2 | 2 |
|
3 | 3 | import python
|
4 | 4 | private import semmle.python.regex
|
| 5 | +private import codeql.regex.nfa.NfaUtils as NfaUtils |
| 6 | +private import codeql.regex.RegexTreeView |
| 7 | +// exporting as RegexTreeView, and in the top-level scope. |
| 8 | +import Impl as RegexTreeView |
5 | 9 | import Impl
|
6 | 10 |
|
7 | 11 | /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
|
@@ -52,8 +56,34 @@ private newtype TRegExpParent =
|
52 | 56 | /** A back reference */
|
53 | 57 | TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
|
54 | 58 |
|
| 59 | +pragma[nomagic] |
| 60 | +private int seqChildEnd(Regex re, int start, int end, int i) { |
| 61 | + result = seqChild(re, start, end, i).getEnd() |
| 62 | +} |
| 63 | + |
| 64 | +// Gets the `i`th item of the sequence of `re` spanning `start` to `end`; moved out so we can use it in the charpred. |
| 65 | +private RegExpTerm seqChild(Regex re, int start, int end, int i) { |
| 66 | + re.sequence(start, end) and |
| 67 | + ( |
| 68 | + i = 0 and |
| 69 | + result.getRegex() = re and |
| 70 | + result.getStart() = start and |
| 71 | + exists(int itemEnd | |
| 72 | + re.item(start, itemEnd) and |
| 73 | + result.getEnd() = itemEnd |
| 74 | + ) |
| 75 | + or |
| 76 | + i > 0 and |
| 77 | + result.getRegex() = re and |
| 78 | + exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | |
| 79 | + result.getStart() = itemStart and |
| 80 | + re.item(itemStart, result.getEnd()) |
| 81 | + ) |
| 82 | + ) |
| 83 | +} |
| 84 | + |
55 | 85 | /** An implementation that satisfies the RegexTreeView signature. */
|
56 |
| -module Impl { |
| 86 | +module Impl implements RegexTreeViewSig { |
57 | 87 | /**
|
58 | 88 | * An element containing a regular expression term, that is, either
|
59 | 89 | * a string literal (parsed as a regular expression)
|
@@ -391,32 +421,6 @@ module Impl {
|
391 | 421 | override string getPrimaryQLClass() { result = "RegExpSequence" }
|
392 | 422 | }
|
393 | 423 |
|
394 |
| - pragma[nomagic] |
395 |
| - private int seqChildEnd(Regex re, int start, int end, int i) { |
396 |
| - result = seqChild(re, start, end, i).getEnd() |
397 |
| - } |
398 |
| - |
399 |
| - // moved out so we can use it in the charpred |
400 |
| - private RegExpTerm seqChild(Regex re, int start, int end, int i) { |
401 |
| - re.sequence(start, end) and |
402 |
| - ( |
403 |
| - i = 0 and |
404 |
| - result.getRegex() = re and |
405 |
| - result.getStart() = start and |
406 |
| - exists(int itemEnd | |
407 |
| - re.item(start, itemEnd) and |
408 |
| - result.getEnd() = itemEnd |
409 |
| - ) |
410 |
| - or |
411 |
| - i > 0 and |
412 |
| - result.getRegex() = re and |
413 |
| - exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | |
414 |
| - result.getStart() = itemStart and |
415 |
| - re.item(itemStart, result.getEnd()) |
416 |
| - ) |
417 |
| - ) |
418 |
| - } |
419 |
| - |
420 | 424 | /**
|
421 | 425 | * An alternative term, that is, a term of the form `a|b`.
|
422 | 426 | *
|
@@ -1030,4 +1034,62 @@ module Impl {
|
1030 | 1034 |
|
1031 | 1035 | override string getPrimaryQLClass() { result = "RegExpBackRef" }
|
1032 | 1036 | }
|
| 1037 | + |
| 1038 | + class Top = RegExpParent; |
| 1039 | + |
| 1040 | + /** |
| 1041 | + * Holds if `term` is a character class escape such as `\d`, |
| 1042 | + * where `clazz` is the value of that escape, e.g. "d" for `\d`. |
| 1043 | + */ |
| 1044 | + predicate isEscapeClass(RegExpTerm term, string clazz) { |
| 1045 | + term.(RegExpCharacterClassEscape).getValue() = clazz |
| 1046 | + } |
| 1047 | + |
| 1048 | + /** |
| 1049 | + * Holds if `term` is a possessive quantifier. |
| 1050 | + * Possessive quantifiers (only available in Python's `re` from version 3.11) are not recognised here, so this never holds, but it is used by the shared library. |
| 1051 | + */ |
| 1052 | + predicate isPossessive(RegExpQuantifier term) { none() } |
| 1053 | + |
| 1054 | + /** |
| 1055 | + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. |
| 1056 | + * Not yet implemented for Python, so this currently holds for every term. |
| 1057 | + */ |
| 1058 | + predicate matchesAnyPrefix(RegExpTerm term) { any() } |
| 1059 | + |
| 1060 | + /** |
| 1061 | + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. |
| 1062 | + * Not yet implemented for Python, so this currently holds for every term. |
| 1063 | + */ |
| 1064 | + predicate matchesAnySuffix(RegExpTerm term) { any() } |
| 1065 | + |
| 1066 | + /** |
| 1067 | + * Holds if the regular expression should not be considered. |
| 1068 | + * |
| 1069 | + * We make the pragmatic performance optimization to ignore regular expressions in files |
| 1070 | + * that do not belong to the project code (such as installed dependencies). |
| 1071 | + */ |
| 1072 | + predicate isExcluded(RegExpParent parent) { |
| 1073 | + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) |
| 1074 | + or |
| 1075 | + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so |
| 1076 | + // we explicitly exclude regexes with more than 10 such occurrences. |
| 1077 | + count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 |
| 1078 | + } |
| 1079 | + |
| 1080 | + /** |
| 1081 | + * Holds if `root` is a root term whose literal has the `i` flag for case-insensitive matching. |
| 1082 | + */ |
| 1083 | + predicate isIgnoreCase(RegExpTerm root) { |
| 1084 | + root.isRootTerm() and |
| 1085 | + root.getLiteral().isIgnoreCase() |
| 1086 | + } |
| 1087 | + |
| 1088 | + /** |
| 1089 | + * Holds if `root` is a root term whose literal has the `s` (dot-all) flag, which lets `.` also match newlines. |
| 1090 | + */ |
| 1091 | + predicate isDotAll(RegExpTerm root) { |
| 1092 | + root.isRootTerm() and |
| 1093 | + root.getLiteral().isDotAll() |
| 1094 | + } |
1033 | 1095 | }
|
0 commit comments