Skip to content

Commit 2019ddf

Browse files
Qldoc improvements + add a few extra tests
1 parent 5c8ef28 commit 2019ddf

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

python/ql/lib/semmle/python/frameworks/Lxml.qll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ module Lxml {
6969
*/
7070
class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
7171
XPathCall() {
72+
// TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
73+
// but we don't really have a way to model that nicely.
7274
this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall()
7375
}
7476

@@ -201,9 +203,10 @@ module Lxml {
201203
* A call to either of:
202204
* - `lxml.etree.fromstring`
203205
* - `lxml.etree.fromstringlist`
204-
* -
206+
* - `lxml.etree.HTML`
205207
* - `lxml.etree.XML`
206208
* - `lxml.etree.XMLID`
209+
* - `lxml.etree.XMLDTDID`
207210
* - `lxml.etree.parse`
208211
* - `lxml.etree.parseid`
209212
*
@@ -329,7 +332,7 @@ module Lxml {
329332
* calls, or a special parameter that will be set when functions are called by an external
330333
* library.
331334
*
332-
* Use the predicate `Element::instance()` to get references to instances of `lxml.etree.ElementTree` instances.
335+
* Use the predicate `Element::instance()` to get references to instances of `lxml.etree.Element` instances.
333336
*/
334337
abstract class InstanceSource instanceof API::Node {
335338
/** Gets a textual representation of this element. */
@@ -354,7 +357,8 @@ module Lxml {
354357
etreeRef().getMember("get_default_parser").getReturn()
355358
].getMember("close").getReturn()
356359
or
357-
// TODO: `XMLID` and `parseid` returns a tuple of which the first element is an `Element`
360+
// TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`.
361+
// `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`.
358362
this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn()
359363
}
360364
}
@@ -393,15 +397,18 @@ module Lxml {
393397
}
394398
}
395399

396-
/** An additional taint step from an `Element` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase */
400+
/**
401+
* An additional taint step from an `Element` instance.
402+
* See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase.
403+
*/
397404
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
398405
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
399406
exists(DataFlow::MethodCallNode call |
400407
nodeTo = call and instance().asSource().flowsTo(nodeFrom)
401408
|
402409
call.calls(nodeFrom,
403-
// We consider a node to be tainted if there could be taint anywhere in the element tree
404-
// So sibling nodes (e.g. `getnext`) are also tainted
410+
// We consider a node to be tainted if there could be taint anywhere in the element tree;
411+
// So sibling nodes (e.g. `getnext`) are also tainted.
405412
// This ensures nodes like `elem[0].getnext()` are tracked.
406413
[
407414
"cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator",
@@ -445,7 +452,7 @@ module Lxml {
445452
ElementTreeInstance() { this = classRef().getAnInstance() }
446453
}
447454

448-
/** The result of a parst operation that returns an `ElementTree` */
455+
/** The result of a parst operation that returns an `ElementTree`. */
449456
private class ParseResult extends InstanceSource {
450457
ParseResult() { this = etreeRef().getMember("parse").getReturn() }
451458
}

python/ql/test/library-tests/frameworks/lxml/taint_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def test():
3232
elem, # $ tainted
3333
ET.tostring(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostring(..)
3434
ET.tostringlist(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tostringlist(..)
35+
ET.tounicode(elem), # $ tainted encodeFormat=XML encodeInput=elem encodeOutput=ET.tounicode(..)
3536
elem.attrib, # $ tainted
3637
elem.base, # $ tainted
3738
elem.nsmap, # $ tainted
@@ -82,7 +83,7 @@ def test():
8283
)
8384

8485
buf = io.StringIO(src)
85-
tree = ET.parse(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parse(..) SPURIOUS:getAPathArgument=buf # Spurious as this is used as a file-like objectt, not a path
86+
tree = ET.parse(buf) # $ decodeFormat=XML decodeInput=buf xmlVuln='XXE' decodeOutput=ET.parse(..) SPURIOUS:getAPathArgument=buf # Spurious as this is used as a file-like object, not a path
8687
ensure_tainted(
8788
tree, # $ tainted
8889
tree.getroot().text, # $ tainted
@@ -94,6 +95,7 @@ def test():
9495
next(tree.iter()).text, # $ MISSING:tainted
9596
tree.iterfind("b"), # $ tainted
9697
next(tree.iterfind("b")).text, # $ MISSING:tainted
98+
tree.xpath("b")[0].text, # $ tainted getXPath="b"
9799
)
98100

99101
(elem2, ids) = ET.XMLID(src) # $ decodeFormat=XML decodeInput=src xmlVuln='XXE' decodeOutput=ET.XMLID(..)

0 commit comments

Comments
 (0)