Skip to content

Commit 89167da

Browse files
Model flow steps for lxml
1 parent 1266b24 commit 89167da

File tree

1 file changed

+143
-10
lines changed
  • python/ql/lib/semmle/python/frameworks

1 file changed

+143
-10
lines changed

python/ql/lib/semmle/python/frameworks/Lxml.qll

Lines changed: 143 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
private import python
1010
private import semmle.python.dataflow.new.DataFlow
11+
private import semmle.python.dataflow.new.TaintTracking
1112
private import semmle.python.Concepts
1213
private import semmle.python.ApiGraphs
1314
private import semmle.python.frameworks.data.ModelsAsData
@@ -68,16 +69,7 @@ module Lxml {
6869
*/
6970
class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
7071
XPathCall() {
71-
exists(API::Node parseResult |
72-
parseResult =
73-
etreeRef().getMember(["parse", "fromstring", "fromstringlist", "HTML", "XML"]).getReturn()
74-
or
75-
// TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
76-
// but we don't really have a way to model that nicely.
77-
parseResult = etreeRef().getMember("XMLParser").getReturn().getMember("close").getReturn()
78-
|
79-
this = parseResult.getMember("xpath").getACall()
80-
)
72+
this.(DataFlow::MethodCallNode).calls([Element::instance(), ElementTree::instance()], "xpath")
8173
}
8274

8375
override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] }
@@ -318,4 +310,145 @@ module Lxml {
318310

319311
override DataFlow::Node getAPathArgument() { result = this.getAnInput() }
320312
}
313+
314+
/** Provides models for instances of the `lxml.etree.Element` class. */
315+
module Element {
316+
/** Gets a reference to the `Element` class. */
317+
API::Node classRef() { result = etreeRef().getMember("Element") }
318+
319+
abstract class InstanceSource extends DataFlow::LocalSourceNode { }
320+
321+
/** Gets a reference to an `lxml.etree.Element` instance. */
322+
private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) {
323+
t.start() and
324+
result instanceof InstanceSource
325+
or
326+
exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t))
327+
}
328+
329+
/** Gets a reference to an `lxml.etree.Element` instance. */
330+
DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) }
331+
332+
/** An `Element` instantiated directly. */
333+
private class ElementInstance extends InstanceSource {
334+
ElementInstance() { this = classRef().getACall() }
335+
}
336+
337+
/** The result of a parse operation that returns an `Element`. */
338+
private class ParseResult extends InstanceSource, DataFlow::MethodCallNode {
339+
ParseResult() {
340+
this.calls(XmlParser::instance(_), "close")
341+
or
342+
// TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element`
343+
this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getACall()
344+
}
345+
}
346+
347+
/** An element index such as `etree.parse(...)[0]` */
348+
private class ElementIndex extends InstanceSource, DataFlow::Node {
349+
ElementIndex() { this.asExpr().(Subscript).getObject() = instance().asExpr() }
350+
}
351+
352+
/** A call to a method on an `Element` that returns another `Element`. */
353+
private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode {
354+
ElementMethod() {
355+
// TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others
356+
// an `Element` itself can be used as an iterator of its children.
357+
this.calls(instance(), ["find", "getnext", "getprevious", "getparent"])
358+
}
359+
}
360+
361+
/** A call to a method on an `ElementTree` that returns an `Element`. */
362+
private class ElementTreeMethod extends InstanceSource, DataFlow::MethodCallNode {
363+
ElementTreeMethod() { this.calls(ElementTree::instance(), "getroot") }
364+
}
365+
366+
/** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */
367+
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
368+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
369+
exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() |
370+
call.calls(nodeFrom,
371+
// We don't consider sibling nodes to be tainted (getnext, getprevious, itersiblings)
372+
[
373+
"cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator",
374+
"getparent", "getroottree", "items", "iter", "iterancestors", "iterchildren",
375+
"iterdescendants", "iterfind", "itertext", "keys", "values", "xpath"
376+
])
377+
)
378+
or
379+
exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() |
380+
attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "tag", "tail", "text"])
381+
)
382+
}
383+
}
384+
}
385+
386+
/** Provides models for instances of the `lxml.etree.ElementTree` class. */
387+
module ElementTree {
388+
API::Node classRef() { result = etreeRef().getMember("ElementTree") }
389+
390+
abstract class InstanceSource extends DataFlow::LocalSourceNode { }
391+
392+
/** Gets a reference to an `lxml.etree.ElementTree` instance.` */
393+
private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) {
394+
t.start() and
395+
result instanceof InstanceSource
396+
or
397+
exists(DataFlow::TypeTracker t2 | result = instance(t2).track(t2, t))
398+
}
399+
400+
/** Gets a reference to an `lxml.etree.ElementTree` parsers instance. */
401+
DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) }
402+
403+
/** An `ElementTree` instantiated directly. */
404+
private class ElementTreeInstance extends InstanceSource {
405+
ElementTreeInstance() { this = classRef().getACall() }
406+
}
407+
408+
/** The result of a parst operation that returns an `ElementTree` */
409+
private class ParseResult extends InstanceSource, DataFlow::MethodCallNode {
410+
ParseResult() { this = etreeRef().getMember("parse").getACall() }
411+
}
412+
413+
/** A call to a method on an `Element` that returns another `Element`. */
414+
private class ElementMethod extends InstanceSource, DataFlow::MethodCallNode {
415+
ElementMethod() {
416+
// TODO: methods that return iterators of `Element`s - `findall`, `finditer`, `iter`, a few others
417+
// an `Element` itself can be used as an iterator of its children.
418+
this.calls(Element::instance(), "getroottree")
419+
}
420+
}
421+
422+
/** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */
423+
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
424+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
425+
exists(DataFlow::MethodCallNode call | nodeTo = call and nodeFrom = instance() |
426+
call.calls(nodeFrom,
427+
[
428+
"find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind",
429+
"xpath"
430+
])
431+
)
432+
or
433+
exists(DataFlow::AttrRead attr | nodeTo = attr and nodeFrom = instance() |
434+
attr.accesses(nodeFrom, "docinfo")
435+
)
436+
}
437+
}
438+
}
439+
440+
/** A call to serialise xml to a string */
441+
private class XmlEncoding extends Encoding::Range, DataFlow::CallCfgNode {
442+
XmlEncoding() { this = etreeRef().getMember("tostring").getACall() }
443+
444+
override DataFlow::Node getAnInput() {
445+
result = [this.getArg(0), this.getArgByName("element_or_tree")]
446+
}
447+
448+
override DataFlow::Node getOutput() { result = this }
449+
450+
override string getFormat() { result = "XML" }
451+
}
452+
// TODO: ElementTree.write can write to a file-like object; should that be a flow step?
453+
// It also can accept a filepath which could be a path injection sink.
321454
}

0 commit comments

Comments
 (0)