Skip to content

Commit 6d24591

Browse files
committed
Revert "Python: switch to shared implementation of IncompleteHostnameRegExp.ql"
This reverts commit ce50f35.
1 parent: 1a51f0c · commit: 6d24591

File tree

6 files changed

+31 additions, -250 deletions (lines changed)

config/identical-files.json

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -518,7 +518,6 @@
518518
],
519519
"Hostname Regexp queries": [
520520
"javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
521-
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll",
522-
"python/ql/src/Security/CWE-020/HostnameRegexpShared.qll"
521+
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
523522
]
524523
}

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 0 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
import python
44
private import semmle.python.regex
5-
private import semmle.python.dataflow.new.DataFlow
65

76
/**
87
* An element containing a regular expression term, that is, either
@@ -49,19 +48,6 @@ newtype TRegExpParent =
4948
/** A back reference */
5049
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
5150

52-
/**
53-
* Provides utility predicates related to regular expressions.
54-
*/
55-
module RegExpPatterns {
56-
/**
57-
* Gets a pattern that matches common top-level domain names in lower case.
58-
*/
59-
string getACommonTld() {
60-
// according to ranking by http://google.com/search?q=site:.<<TLD>>
61-
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
62-
}
63-
}
64-
6551
/**
6652
* An element containing a regular expression term, that is, either
6753
* a string literal (parsed as a regular expression)
@@ -459,8 +445,6 @@ class RegExpAlt extends RegExpTerm, TRegExpAlt {
459445
override string getPrimaryQLClass() { result = "RegExpAlt" }
460446
}
461447

462-
class RegExpCharEscape = RegExpEscape;
463-
464448
/**
465449
* An escaped regular expression term, that is, a regular expression
466450
* term starting with a backslash, which is not a backreference.
@@ -767,9 +751,6 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
767751
*/
768752
int getNumber() { result = re.getGroupNumber(start, end) }
769753

770-
/** Holds if this is a capture group. */
771-
predicate isCapture() { exists(this.getNumber()) }
772-
773754
/** Holds if this is a named capture group. */
774755
predicate isNamed() { exists(this.getName()) }
775756

@@ -1028,24 +1009,3 @@ class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
10281009

10291010
/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
10301011
RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() }
1031-
1032-
/**
1033-
* A node whose value may flow to a position where it is interpreted
1034-
* as a part of a regular expression.
1035-
*/
1036-
class RegExpPatternSource extends DataFlow::CfgNode {
1037-
private Regex astNode;
1038-
1039-
RegExpPatternSource() { astNode = this.asExpr() }
1040-
1041-
/**
1042-
* Gets a node where the pattern of this node is parsed as a part of
1043-
* a regular expression.
1044-
*/
1045-
DataFlow::Node getAParse() { result = this }
1046-
1047-
/**
1048-
* Gets the root term of the regular expression parsed from this pattern.
1049-
*/
1050-
RegExpTerm getRegExpTerm() { result.getRegex() = astNode }
1051-
}

python/ql/src/Security/CWE-020/HostnameRegexpShared.qll

Lines changed: 0 additions & 202 deletions
This file was deleted.

python/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll

Lines changed: 0 additions & 2 deletions
This file was deleted.

python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 29 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,35 @@
88
* @id py/incomplete-hostname-regexp
99
* @tags correctness
1010
* security
11-
* external/cwe/cwe-020
11+
* external/cwe/cwe-20
1212
*/
1313

14-
import HostnameRegexpShared
14+
import python
15+
import semmle.python.regex
1516

16-
query predicate problems = incompleteHostnameRegExp/4;
17+
private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" }
18+
19+
/**
20+
* Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
21+
* and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
22+
*/
23+
bindingset[pattern]
24+
predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
25+
hostPart =
26+
pattern
27+
.regexpCapture("(?i).*" +
28+
// an unescaped single `.`
29+
"(?<!\\\\)[.]" +
30+
// immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
31+
"([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))" + ".*", 1)
32+
}
33+
34+
from Regex r, string pattern, string hostPart
35+
where
36+
r.getText() = pattern and
37+
isIncompleteHostNameRegExpPattern(pattern, hostPart) and
38+
// ignore patterns with capture groups after the TLD
39+
not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex() + ").*[(][?]:.*[)].*")
40+
select r,
41+
"This regular expression has an unescaped '.' before '" + hostPart +
42+
"', so it might match more hosts than expected."
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
| hosttest.py:6:31:6:53 | (www\|beta).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | hosttest.py:6:27:6:51 | ControlFlowNode for Str | here |
1+
| hosttest.py:6:27:6:51 | Str | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |

0 commit comments

Comments
 (0)