Skip to content

Commit 29a9023

Browse files
Improve tests and use API graphs
1 parent d2ed92d commit 29a9023

File tree

3 files changed

+156
-102
lines changed

3 files changed

+156
-102
lines changed

python/ql/lib/semmle/python/frameworks/Lxml.qll

Lines changed: 73 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ module Lxml {
6969
*/
7070
class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
7171
XPathCall() {
72-
this.(DataFlow::MethodCallNode).calls([Element::instance(), ElementTree::instance()], "xpath")
72+
this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall()
7373
}
7474

7575
override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] }
@@ -201,6 +201,7 @@ module Lxml {
201201
* A call to either of:
202202
* - `lxml.etree.fromstring`
203203
* - `lxml.etree.fromstringlist`
204+
* -
204205
* - `lxml.etree.XML`
205206
* - `lxml.etree.XMLID`
206207
* - `lxml.etree.parse`
@@ -209,23 +210,27 @@ module Lxml {
209210
* See
210211
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
211212
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist
213+
* - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
212214
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML
213215
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID
216+
* - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID
214217
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
215218
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
216219
*/
217220
private class LxmlParsing extends DataFlow::CallCfgNode, XML::XmlParsing::Range {
218221
string functionName;
219222

220223
LxmlParsing() {
221-
functionName in ["fromstring", "fromstringlist", "XML", "XMLID", "parse", "parseid"] and
224+
functionName in [
225+
"fromstring", "fromstringlist", "HTML", "XML", "XMLID", "XMLDTDID", "parse", "parseid"
226+
] and
222227
this = etreeRef().getMember(functionName).getACall()
223228
}
224229

225230
override DataFlow::Node getAnInput() {
226231
result in [
227232
this.getArg(0),
228-
// fromstring / XML / XMLID
233+
// fromstring / HTML / XML / XMLID / XMLDTDID
229234
this.getArgByName("text"),
230235
// fromstringlist
231236
this.getArgByName("strings"),
@@ -240,7 +245,8 @@ module Lxml {
240245
this.getParserArg() = XmlParser::instanceVulnerableTo(kind)
241246
or
242247
kind.isXxe() and
243-
not exists(this.getParserArg())
248+
not exists(this.getParserArg()) and
249+
not functionName = "HTML"
244250
}
245251

246252
override predicate mayExecuteInput() { none() }
@@ -314,78 +320,97 @@ module Lxml {
314320
/** Provides models for instances of the `lxml.etree.Element` class. */
315321
module Element {
316322
/** Gets a reference to the `Element` class. */
317-
API::Node classRef() { result = etreeRef().getMember("Element") }
318-
319-
abstract class InstanceSource extends DataFlow::LocalSourceNode { }
323+
API::Node classRef() { result = etreeRef().getMember(["Element", "_Element"]) }
320324

321-
/** Gets a reference to an `lxml.etree.Element` instance. */
322-
private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) {
323-
t.start() and
324-
result instanceof InstanceSource
325-
or
326-
exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t))
325+
abstract class InstanceSource instanceof API::Node {
326+
string toString() { result = super.toString() }
327327
}
328328

329329
/** Gets a reference to an `lxml.etree.Element` instance. */
330-
DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) }
330+
API::Node instance() { result instanceof InstanceSource }
331331

332332
/** An `Element` instantiated directly. */
333333
private class ElementInstance extends InstanceSource {
334-
ElementInstance() { this = classRef().getACall() }
334+
ElementInstance() { this = classRef().getAnInstance() }
335335
}
336336

337337
/** The result of a parse operation that returns an `Element`. */
338-
private class ParseResult extends InstanceSource, DataFlow::MethodCallNode {
338+
private class ParseResult extends InstanceSource {
339339
ParseResult() {
340-
this.calls(XmlParser::instance(_), "close")
340+
// TODO: The XmlParser module does not currently use API graphs
341+
this =
342+
[
343+
etreeRef().getMember("XMLParser").getAnInstance(),
344+
etreeRef().getMember("get_default_parser").getReturn()
345+
].getMember("close").getReturn()
341346
or
342347
// TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element`
343-
this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getACall()
348+
this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn()
344349
}
345350
}
346351

347-
/** An element index such as `etree.parse(...)[0]` */
348-
private class ElementIndex extends InstanceSource, DataFlow::Node {
349-
ElementIndex() { this.asExpr().(Subscript).getObject() = instance().asExpr() }
350-
}
351-
352352
/** A call to a method on an `Element` that returns another `Element`. */
353-
private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode {
353+
private class ElementMethod extends InstanceSource {
354354
ElementMethod() {
355-
// TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others
356-
// an `Element` itself can be used as an iterator of its children.
357-
this.calls(instance(), ["find", "getnext", "getprevious", "getparent"])
355+
// an Element is an iterator of Elements
356+
this = instance().getASubscript()
357+
or
358+
// methods that return an Element
359+
this = instance().getMember(["find", "getnext", "getprevious", "getparent"]).getReturn()
360+
or
361+
// methods that return an iterator of Elements
362+
this =
363+
instance()
364+
.getMember([
365+
"cssselect", "findall", "getchildren", "getiterator", "iter", "iterancestors",
366+
"iterdecendants", "iterchildren", "itersiblings", "iterfind", "xpath"
367+
])
368+
.getReturn()
369+
.getASubscript()
358370
}
359371
}
360372

361373
/** A call to a method on an `ElementTree` that returns an `Element`. */
362-
private class ElementTreeMethod extends InstanceSource, DataFlow::MethodCallNode {
363-
ElementTreeMethod() { this.calls(ElementTree::instance(), "getroot") }
374+
private class ElementTreeMethod extends InstanceSource {
375+
ElementTreeMethod() {
376+
this = ElementTree::instance().getMember(["getroot", "find"]).getReturn()
377+
or
378+
this =
379+
ElementTree::instance()
380+
.getMember(["findall", "getiterator", "iter", "iterfind", "xpath"])
381+
.getReturn()
382+
.getASubscript()
383+
}
364384
}
365385

366386
/** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */
367387
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
368388
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
369-
exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() |
389+
exists(DataFlow::MethodCallNode call |
390+
nodeTo = call and instance().asSource().flowsTo(nodeFrom)
391+
|
370392
call.calls(nodeFrom,
371-
// We don't consider sibling nodes to be tainted (getnext, getprevious, itersiblings)
393+
// We consider a node to be tainted if there could be taint anywhere in the element tree
394+
// So sibling nodes (e.g. `getnext`) are also tainted
395+
// This ensures nodes like `elem[0].getnext()` are tracked.
372396
[
373397
"cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator",
374-
"getparent", "getroottree", "items", "iter", "iterancestors", "iterchildren",
375-
"iterdescendants", "iterfind", "itertext", "keys", "values", "xpath"
398+
"getnext", "getparent", "getprevious", "getroottree", "items", "iter",
399+
"iterancestors", "iterchildren", "iterdescendants", "itersiblings", "iterfind",
400+
"itertext", "keys", "values", "xpath"
376401
])
377402
)
378403
or
379-
exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() |
380-
attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "tag", "tail", "text"])
404+
exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) |
405+
attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "prefix", "tag", "tail", "text"])
381406
)
382407
}
383408
}
384409
}
385410

386411
/** Provides models for instances of the `lxml.etree.ElementTree` class. */
387412
module ElementTree {
388-
API::Node classRef() { result = etreeRef().getMember("ElementTree") }
413+
API::Node classRef() { result = etreeRef().getMember(["ElementTree", "_ElementTree"]) }
389414

390415
/**
391416
* A source of instances of `lxml.etree.ElementTree` instances, extend this class to model new instances.
@@ -396,50 +421,42 @@ module Lxml {
396421
*
397422
* Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances.
398423
*/
399-
abstract class InstanceSource extends DataFlow::LocalSourceNode { }
400-
401-
/** Gets a reference to an `lxml.etree.ElementTree` instance.` */
402-
private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) {
403-
t.start() and
404-
result instanceof InstanceSource
405-
or
406-
exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t))
424+
abstract class InstanceSource instanceof API::Node {
425+
string toString() { result = super.toString() }
407426
}
408427

409428
/** Gets a reference to an `lxml.etree.ElementTree` instance. */
410-
DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) }
429+
API::Node instance() { result instanceof InstanceSource }
411430

412431
/** An `ElementTree` instantiated directly. */
413432
private class ElementTreeInstance extends InstanceSource {
414-
ElementTreeInstance() { this = classRef().getACall() }
433+
ElementTreeInstance() { this = classRef().getAnInstance() }
415434
}
416435

417436
/** The result of a parst operation that returns an `ElementTree` */
418-
private class ParseResult extends InstanceSource, DataFlow::MethodCallNode {
419-
ParseResult() { this = etreeRef().getMember("parse").getACall() }
437+
private class ParseResult extends InstanceSource {
438+
ParseResult() { this = etreeRef().getMember("parse").getReturn() }
420439
}
421440

422441
/** A call to a method on an `Element` that returns another `Element`. */
423-
private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode {
424-
ElementMethod() {
425-
// TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others
426-
// an `Element` itself can be used as an iterator of its children.
427-
this.calls(Element::instance(), "getroottree")
428-
}
442+
private class ElementMethod extends InstanceSource {
443+
ElementMethod() { this = Element::instance().getMember("getroottree").getReturn() }
429444
}
430445

431446
/** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */
432447
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
433448
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
434-
exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() |
449+
exists(DataFlow::MethodCallNode call |
450+
nodeTo = call and instance().asSource().flowsTo(nodeFrom)
451+
|
435452
call.calls(nodeFrom,
436453
[
437454
"find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind",
438455
"xpath"
439456
])
440457
)
441458
or
442-
exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() |
459+
exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) |
443460
attr.accesses(nodeFrom, "docinfo")
444461
)
445462
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
argumentToEnsureNotTaintedNotMarkedAsSpurious
2+
untaintedArgumentToEnsureTaintedNotMarkedAsMissing
3+
testFailures
4+
failures

0 commit comments

Comments
 (0)