Skip to content

Commit b737bdb

Browse files
committed
add a Java implementation of RegexTreeViewSig
1 parent 20254df commit b737bdb

File tree

1 file changed

+87
-4
lines changed

1 file changed

+87
-4
lines changed

java/ql/lib/semmle/code/java/regex/RegexTreeView.qll

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
/** Provides a class hierarchy corresponding to a parse tree of regular expressions. */
22

3-
private import java
4-
private import semmle.code.java.regex.regex
3+
private import semmle.code.java.regex.regex as RE // importing under a namespace to avoid a naming conflict for `Top`.
4+
private import codeql.regex.nfa.NfaUtils as NfaUtils
5+
// exporting as RegexTreeView, and in the top-level scope.
6+
import Impl as RegexTreeView
57
import Impl
68

79
/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
8-
RegExpTerm getParsedRegExp(StringLiteral re) { result.getRegex() = re and result.isRootTerm() }
10+
RegExpTerm getParsedRegExp(RE::StringLiteral re) { result.isRootTerm() and re = result.getRegex() }
11+
12+
// Private aliases for types from the `RE` import, so that they can be referenced unqualified in this file.
private class Regex = RE::Regex;
13+
14+
private class Location = RE::Location;
15+
16+
private class File = RE::File;
917

1018
/**
1119
* An element containing a regular expression term, that is, either
@@ -53,7 +61,10 @@ private newtype TRegExpParent =
5361
/** A back reference */
5462
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
5563

56-
module Impl {
64+
private import codeql.regex.RegexTreeView
65+
66+
/** An implementation that satisfies the RegexTreeView signature. */
67+
module Impl implements RegexTreeViewSig {
5768
/**
5869
* An element containing a regular expression term, that is, either
5970
* a string literal (parsed as a regular expression; the root of the parse tree)
@@ -547,6 +558,13 @@ module Impl {
547558
}
548559
}
549560

561+
/**
562+
* A word boundary, that is, a regular expression term of the form `\b`.
*
* This is any `RegExpSpecialChar` whose `getChar()` is `\b`.
563+
*/
564+
class RegExpWordBoundary extends RegExpSpecialChar {
565+
RegExpWordBoundary() { this.getChar() = "\\b" }
566+
}
567+
550568
/**
551569
* Gets the hex number for the `hex` char.
552570
*/
@@ -1088,4 +1106,69 @@ module Impl {
10881106

10891107
/** Gets the name of the primary QL class; this override always yields the string `RegExpBackRef`. */
override string getPrimaryQLClass() { result = "RegExpBackRef" }
10901108
}
1109+
1110+
/**
* An alias that exposes `RegExpParent` under the name `Top`.
*
* NOTE(review): presumably the name `Top` is required by `RegexTreeViewSig` - confirm against the signature.
*/
class Top = RegExpParent;
1111+
1112+
/**
1113+
* Holds if `term` is an escape class representing e.g. `\d`.
1114+
* `clazz` is which character class it represents, e.g. "d" for `\d`.
*
* `clazz` is either the value of `term` when it is a `RegExpCharacterClassEscape`,
* or the backslash equivalent of `term` when it is a `RegExpNamedProperty`.
1115+
*/
1116+
predicate isEscapeClass(RegExpTerm term, string clazz) {
1117+
term.(RegExpCharacterClassEscape).getValue() = clazz
1118+
or
1119+
term.(RegExpNamedProperty).getBackslashEquivalent() = clazz
1120+
}
1121+
1122+
/**
1123+
* Holds if `term` is a possessive quantifier, e.g. `a*+`.
*
* This delegates to the `isPossessive()` member predicate of `term`.
1124+
*/
1125+
predicate isPossessive(RegExpQuantifier term) { term.isPossessive() }
1126+
1127+
/**
1128+
* Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against.
*
* This is implemented as `matchesFullString()` not holding for the regex of `term`,
* which is the same condition that `matchesAnySuffix` uses.
1129+
*/
1130+
predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() }
1131+
1132+
/**
1133+
* Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against.
*
* This is implemented as `matchesFullString()` not holding for the regex of `term`,
* which is the same condition that `matchesAnyPrefix` uses.
1134+
*/
1135+
predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() }
1136+
1137+
/**
1138+
* Holds if the regular expression should not be considered.
1139+
*
1140+
* We make the pragmatic performance optimization to ignore regular expressions in files
1141+
* that do not belong to the project code (such as installed dependencies).
* Concretely, these are the regular expressions for which no relative file path exists.
*
* Regular expressions whose text contains more than 10 occurrences of `.*` are excluded as well.
1142+
*/
1143+
predicate isExcluded(RegExpParent parent) {
1144+
not exists(parent.getRegex().getLocation().getFile().getRelativePath())
1145+
or
1146+
// Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so
1147+
// we explicitly exclude these.
1148+
strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
1149+
}
1150+
1151+
/**
1152+
* Holds if `root` has the `i` flag for case-insensitive matching.
*
* This only holds for root terms; the flag is read via `root.getLiteral().isIgnoreCase()`.
1153+
*/
1154+
predicate isIgnoreCase(RegExpTerm root) {
1155+
root.isRootTerm() and
1156+
root.getLiteral().isIgnoreCase()
1157+
}
1158+
1159+
/**
1160+
* DEPRECATED: this predicate is marked `deprecated`.
* NOTE(review): the flag-specific predicates `isIgnoreCase` and `isDotAll` look like the intended replacement - confirm.
*
* Gets the flags for `root`, or the empty string if `root` has no flags.
*
* This only has a result for root terms; the value is taken from `root.getLiteral().getFlags()`.
1161+
*/
1162+
deprecated string getFlags(RegExpTerm root) {
1163+
root.isRootTerm() and
1164+
result = root.getLiteral().getFlags()
1165+
}
1166+
1167+
/**
1168+
* Holds if `root` has the `s` flag (DOTALL mode), in which `.` also matches line terminators.
*
* This only holds for root terms; the flag is read via `root.getLiteral().isDotAll()`.
1169+
*/
1170+
predicate isDotAll(RegExpTerm root) {
1171+
root.isRootTerm() and
1172+
root.getLiteral().isDotAll()
1173+
}
10911174
}

0 commit comments

Comments
 (0)