Skip to content

Commit bc17bf6

Browse files
authored
Merge pull request github#14317 from yoff/python/fix-regex-string-part-locations
Python: Improve computation of regex fragments inside string parts
2 parents eb2db59 + 8ade9ed commit bc17bf6

File tree

5 files changed

+78
-5
lines changed

5 files changed

+78
-5
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Regular expression fragments residing inside implicitly concatenated strings now have better location information.

python/ql/lib/semmle/python/AstExtended.qll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,28 @@ class StringPart extends StringPart_, AstNode {
154154
override string toString() { result = StringPart_.super.toString() }
155155

156156
override Location getLocation() { result = StringPart_.super.getLocation() }
157+
158+
/**
159+
* Holds if the content of string `StringPart` is surrounded by
160+
* a prefix (including a quote) of length `prefixLength` and
161+
* a quote of length `quoteLength`.
162+
*/
163+
predicate contextSize(int prefixLength, int quoteLength) {
164+
exists(int occurrenceOffset |
165+
quoteLength = this.getText().regexpFind("\"{3}|\"{1}|'{3}|'{1}", 0, occurrenceOffset).length() and
166+
prefixLength = occurrenceOffset + quoteLength
167+
)
168+
}
169+
170+
/**
171+
* Gets the length of the content, that is the text between the prefix and the quote.
172+
* See `context` for obtaining the prefix and the quote.
173+
*/
174+
int getContentLength() {
175+
exists(int prefixLength, int quoteLength | this.contextSize(prefixLength, quoteLength) |
176+
result = this.getText().length() - prefixLength - quoteLength
177+
)
178+
}
157179
}
158180

159181
class StringPartList extends StringPartList_ { }

python/ql/lib/semmle/python/regexp/RegexTreeView.qll

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,16 +223,55 @@ module Impl implements RegexTreeViewSig {
223223
*/
224224
Location getLocation() { result = re.getLocation() }
225225

226+
/** Gets the accumulated length of string parts with lower index than `index`, if any. */
227+
private int getPartOffset(int index) {
228+
index = 0 and result = 0
229+
or
230+
index > 0 and
231+
exists(int previousOffset | previousOffset = this.getPartOffset(index - 1) |
232+
result =
233+
previousOffset + re.(StrConst).getImplicitlyConcatenatedPart(index - 1).getContentLength()
234+
)
235+
}
236+
237+
/**
238+
* Gets the `StringPart` in which this `RegExpTerm` resides, if any.
239+
* `localOffset` will be the offset of this `RegExpTerm` inside `result`.
240+
*/
241+
StringPart getPart(int localOffset) {
242+
exists(int index, int prefixLength | index = max(int i | this.getPartOffset(i) <= start) |
243+
result = re.(StrConst).getImplicitlyConcatenatedPart(index) and
244+
result.contextSize(prefixLength, _) and
245+
// Example:
246+
// re.compile('...' r"""...this..""")
247+
// - `start` is the offset from `(` to `this` as counted after concatenating all parts.
248+
// - we subtract the length of the previous `StringPart`s, `'...'`, to know how far into this `StringPart` we go.
249+
// - as the prefix 'r"""' is part of the `StringPart`, `this` is found that much further in.
250+
localOffset = start - this.getPartOffset(index) + prefixLength
251+
)
252+
}
253+
226254
/** Holds if this term is found at the specified location offsets. */
227255
predicate hasLocationInfo(
228256
string filepath, int startline, int startcolumn, int endline, int endcolumn
229257
) {
258+
not exists(this.getPart(_)) and
230259
exists(int re_start, int prefix_len | prefix_len = re.getPrefix().length() |
231-
re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, _) and
260+
re.getLocation().hasLocationInfo(filepath, startline, re_start, _, _) and
232261
startcolumn = re_start + start + prefix_len and
262+
endline = startline and
233263
endcolumn = re_start + end + prefix_len - 1
234264
/* inclusive vs exclusive */
235265
)
266+
or
267+
exists(StringPart part, int localOffset, int partStartColumn |
268+
part = this.getPart(localOffset)
269+
|
270+
part.getLocation().hasLocationInfo(filepath, startline, partStartColumn, _, _) and
271+
startcolumn = partStartColumn + localOffset and
272+
endline = startline and
273+
endcolumn = (end - start) + startcolumn
274+
)
236275
}
237276

238277
/** Gets the file in which this term is found. */

python/ql/test/library-tests/regexparser/locations.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,31 @@
5050
)
5151

5252
# plain string with multiple parts
53-
re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=1:26
53+
re.compile( # $ location=1:2 location=1:26
5454
'[this] is a test' ' and [this] is another test'
5555
)
5656

5757
# plain string with multiple parts across lines
58-
re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=2:7
58+
re.compile( # $ location=1:2 location=2:7 location=3:2
5959
'[this] is a test'
6060
' and [this] is another test'
61+
'[this] comes right at the start of a part'
6162
)
6263

6364
# plain string with multiple parts across lines and comments
64-
re.compile( # $ location=1:2 SPURIOUS:location=1:23 MISSING:location=3:7
65+
re.compile( # $ location=1:2 location=3:7
6566
'[this] is a test'
6667
# comment
6768
' and [this] is another test'
6869
)
6970

71+
# multiple parts of different kinds
72+
re.compile( # $ location=1:2 location=1:28 location=2:11 location=3:8
73+
'[this] is a test' ''' and [this] is another test'''
74+
br""" and [this] is yet another test"""
75+
r' and [this] is one more'
76+
)
77+
7078
# actual multiline string
7179
re.compile( # $ SPURIOUS:location=1:6 location=1:27 MISSING:location=2:1 location=3:5
7280
r'''

python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
| KnownCVEs.py:15:20:15:22 | \\d+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
2-
| KnownCVEs.py:30:21:31:22 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ','. |
2+
| KnownCVEs.py:30:21:30:23 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ','. |
33
| KnownCVEs.py:35:18:35:81 | ([-/:,#%.'"\\s!\\w]\|\\w-\\w\|'[\\s\\w]+'\\s*\|"[\\s\\w]+"\|\\([\\d,%\\.\\s]+\\))* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '"\\t"'. |
44
| redos.py:6:28:6:42 | (?:__\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '__'. |
55
| redos.py:6:52:6:68 | (?:\\*\\*\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings starting with '*' and containing many repetitions of '**'. |

0 commit comments

Comments
 (0)