Skip to content

Commit 1aeaefc

Browse files
committed
add a Python implementation of RegexTreeViewSig
1 parent 5fbcbbc commit 1aeaefc

File tree

1 file changed

+89
-27
lines changed

1 file changed

+89
-27
lines changed

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
import python
44
private import semmle.python.regex
5+
private import codeql.regex.nfa.NfaUtils as NfaUtils
6+
private import codeql.regex.RegexTreeView
7+
// exporting as RegexTreeView, and in the top-level scope.
8+
import Impl as RegexTreeView
59
import Impl
610

711
/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
@@ -52,8 +56,34 @@ private newtype TRegExpParent =
5256
/** A back reference */
5357
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
5458

59+
/**
* Gets the end offset of the `i`th child (as computed by `seqChild`) of the
* sequence in `re` that spans `start` to `end`.
*
* Used by `seqChild` to find where the next child starts.
* NOTE(review): `pragma[nomagic]` presumably keeps this recursive step as its own
* join; confirm the intended optimizer effect.
*/
pragma[nomagic]
60+
private int seqChildEnd(Regex re, int start, int end, int i) {
61+
result = seqChild(re, start, end, i).getEnd()
62+
}
63+
64+
/**
* Gets the `i`th (zero-based) child term of the sequence in `re` that spans
* `start` to `end`.
*
* Child 0 begins at `start`; child `i` (for `i > 0`) begins where child `i - 1`
* ends. Every child covers exactly one `item` of `re` and belongs to `re`.
*/
// moved out so we can use it in the charpred
65+
private RegExpTerm seqChild(Regex re, int start, int end, int i) {
66+
re.sequence(start, end) and
67+
(
68+
// Base case: the first child starts at the start of the sequence.
i = 0 and
69+
result.getRegex() = re and
70+
result.getStart() = start and
71+
exists(int itemEnd |
72+
re.item(start, itemEnd) and
73+
result.getEnd() = itemEnd
74+
)
75+
or
76+
// Recursive case: the child starts where the previous child ended.
i > 0 and
77+
result.getRegex() = re and
78+
exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) |
79+
result.getStart() = itemStart and
80+
re.item(itemStart, result.getEnd())
81+
)
82+
)
83+
}
84+
5585
/** An implementation that satisfies the RegexTreeView signature. */
56-
module Impl {
86+
module Impl implements RegexTreeViewSig {
5787
/**
5888
* An element containing a regular expression term, that is, either
5989
* a string literal (parsed as a regular expression)
@@ -391,32 +421,6 @@ module Impl {
391421
override string getPrimaryQLClass() { result = "RegExpSequence" }
392422
}
393423

394-
pragma[nomagic]
395-
private int seqChildEnd(Regex re, int start, int end, int i) {
396-
result = seqChild(re, start, end, i).getEnd()
397-
}
398-
399-
// moved out so we can use it in the charpred
400-
private RegExpTerm seqChild(Regex re, int start, int end, int i) {
401-
re.sequence(start, end) and
402-
(
403-
i = 0 and
404-
result.getRegex() = re and
405-
result.getStart() = start and
406-
exists(int itemEnd |
407-
re.item(start, itemEnd) and
408-
result.getEnd() = itemEnd
409-
)
410-
or
411-
i > 0 and
412-
result.getRegex() = re and
413-
exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) |
414-
result.getStart() = itemStart and
415-
re.item(itemStart, result.getEnd())
416-
)
417-
)
418-
}
419-
420424
/**
421425
* An alternative term, that is, a term of the form `a|b`.
422426
*
@@ -1030,4 +1034,62 @@ module Impl {
10301034

10311035
override string getPrimaryQLClass() { result = "RegExpBackRef" }
10321036
}
1037+
1038+
/**
* An alias for `RegExpParent`.
* NOTE(review): presumably the `Top` type required by `RegexTreeViewSig`; confirm against the signature.
*/
class Top = RegExpParent;
1039+
1040+
/**
1041+
* Holds if `term` is an escape class representing e.g. `\d`.
1042+
* `clazz` is which character class it represents, e.g. "d" for `\d`.
*
* That is, `term` is a `RegExpCharacterClassEscape` whose `getValue()` is `clazz`.
1043+
*/
1044+
predicate isEscapeClass(RegExpTerm term, string clazz) {
1045+
exists(RegExpCharacterClassEscape escape | term = escape | escape.getValue() = clazz)
1046+
}
1047+
1048+
/**
1049+
* Holds if `term` is a possessive quantifier.
1050+
* As Python's regexes are treated here as not supporting possessive quantifiers, this never holds,
* but the predicate is required by the shared library.
*
* NOTE(review): Python 3.11 and later accept possessive quantifiers (`*+`, `++`, `?+`) in `re`;
* confirm whether the parser models them before relying on this always being empty.
1051+
*/
1052+
predicate isPossessive(RegExpQuantifier term) { none() }
1053+
1054+
/**
1055+
* Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against.
1056+
* Not yet implemented for Python, so this currently holds for every term.
1057+
*/
1058+
predicate matchesAnyPrefix(RegExpTerm term) { any() }
1059+
1060+
/**
1061+
* Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against.
1062+
* Not yet implemented for Python, so this currently holds for every term.
1063+
*/
1064+
predicate matchesAnySuffix(RegExpTerm term) { any() }
1065+
1066+
/**
1067+
* Holds if the regular expression should not be considered.
1068+
*
1069+
* We make the pragmatic performance optimization to ignore regular expressions in files
1070+
* that do not belong to the project code (such as installed dependencies), i.e. files
* without a relative path. Regular expressions containing more than 10 occurrences of
* `.*` are excluded as well.
1071+
*/
1072+
predicate isExcluded(RegExpParent parent) {
1073+
not exists(parent.getRegex().getLocation().getFile().getRelativePath())
1074+
or
1075+
// Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so
1076+
// we explicitly exclude these.
1077+
count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
1078+
}
1079+
1080+
/**
1081+
* Holds if `root` has the `i` flag for case-insensitive matching.
* Only holds for root terms; the flag is read from the literal that `root` belongs to.
1082+
*/
1083+
predicate isIgnoreCase(RegExpTerm root) {
1084+
root.isRootTerm() and
1085+
root.getLiteral().isIgnoreCase()
1086+
}
1087+
1088+
/**
1089+
* Holds if `root` has the `s` flag for dot-all matching, that is, `.` also matches newline characters.
* Only holds for root terms; the flag is read from the literal that `root` belongs to.
1090+
*/
1091+
predicate isDotAll(RegExpTerm root) {
1092+
root.isRootTerm() and
1093+
root.getLiteral().isDotAll()
1094+
}
10331095
}

0 commit comments

Comments
 (0)