Skip to content

Commit 2f8ddda

Browse files
authored
Merge pull request github#11570 from Sim4n6/UnsafeUnpack
Python: Unsafe unpacking using `shutil.unpack_archive()` query and tests
2 parents 2f6ffdd + d7af801 commit 2f8ddda

File tree

17 files changed

+612
-0
lines changed

17 files changed

+612
-0
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>Extracting files from a malicious tarball without validating that the destination file path
8+
is within the destination directory using <code>shutil.unpack_archive()</code> can cause files outside the
9+
destination directory to be overwritten, due to the possible presence of directory traversal elements
10+
(<code>..</code>) in archive path names.</p>
11+
12+
<p>Tarball contain archive entries representing each file in the archive. These entries
13+
include a file path for the entry, but these file paths are not restricted and may contain
14+
unexpected special elements such as the directory traversal element (<code>..</code>). If these
15+
file paths are used to determine an output file to write the contents of the archive item to, then
16+
the file may be written to an unexpected location. This can result in sensitive information being
17+
revealed or deleted, or an attacker being able to influence behavior by modifying unexpected
18+
files.</p>
19+
20+
<p>For example, if a tarball contains a file entry <code>../sneaky-file.txt</code>, and the tarball
21+
is extracted to the directory <code>/tmp/tmp123</code>, then naively combining the paths would result
22+
in an output file path of <code>/tmp/tmp123/../sneaky-file.txt</code>, which would cause the file to be
23+
written to <code>/tmp/</code>.</p>
24+
25+
</overview>
26+
<recommendation>
27+
28+
<p>Ensure that output paths constructed from tarball entries are validated
29+
to prevent writing files to unexpected locations.</p>
30+
31+
<p>Consider using a safer module, such as: <code>zipfile</code></p>
32+
33+
</recommendation>
34+
35+
<example>
36+
<p>
37+
In this example an archive is extracted without validating file paths.
38+
</p>
39+
40+
<sample src="examples/HIT_UnsafeUnpack.py" />
41+
42+
<p>To fix this vulnerability, we need to call the function <code>tarfile.extract()</code>
43+
on each <code>member</code> after verifying that it does not contain either <code>..</code> or startswith <code>/</code>.
44+
</p>
45+
46+
<sample src="examples/NoHIT_UnsafeUnpack.py" />
47+
48+
</example>
49+
<references>
50+
51+
<li>
52+
Shutil official documentation
53+
<a href="https://docs.python.org/3/library/shutil.html?highlight=unpack_archive#shutil.unpack_archive">shutil.unpack_archive() warning.</a>
54+
</li>
55+
</references>
56+
</qhelp>
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* @name Arbitrary file write during a tarball extraction from a user controlled source
3+
* @description Extracting files from a potentially malicious tarball using `shutil.unpack_archive()` without validating
4+
* that the destination file path is within the destination directory can cause files outside
5+
* the destination directory to be overwritten. More precisely, if the tarball comes from a user controlled
6+
* location either a remote one or cli argument.
7+
* @kind path-problem
8+
* @id py/unsafe-unpacking
9+
* @problem.severity error
10+
* @security-severity 7.5
11+
* @precision high
12+
* @tags security
13+
* experimental
14+
* external/cwe/cwe-022
15+
*/
16+
17+
import python
18+
import experimental.Security.UnsafeUnpackQuery
19+
import DataFlow::PathGraph
20+
21+
from UnsafeUnpackingConfig config, DataFlow::PathNode source, DataFlow::PathNode sink
22+
where config.hasFlowPath(source, sink)
23+
select sink.getNode(), source, sink,
24+
"Unsafe extraction from a malicious tarball retrieved from a remote location."
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import requests
2+
import shutil
3+
4+
url = "https://www.someremote.location/tarball.tar.gz"
5+
response = requests.get(url, stream=True)
6+
7+
tarpath = "/tmp/tmp456/tarball.tar.gz"
8+
with open(tarpath, "wb") as f:
9+
f.write(response.raw.read())
10+
11+
untarredpath = "/tmp/tmp123"
12+
shutil.unpack_archive(tarpath, untarredpath)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import requests
2+
import tarfile
3+
4+
url = "https://www.someremote.location/tarball.tar.gz"
5+
response = requests.get(url, stream=True)
6+
7+
tarpath = "/tmp/tmp456/tarball.tar.gz"
8+
with open(tarpath, "wb") as f:
9+
f.write(response.raw.read())
10+
11+
untarredpath = "/tmp/tmp123"
12+
with tarfile.open(tarpath) as tar:
13+
for member in tar.getmembers():
14+
if member.name.startswith("/") or ".." in member.name:
15+
raise Exception("Path traversal identified in tarball")
16+
17+
tar.extract(untarredpath, member)
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
/**
2+
* Provides a taint-tracking configuration for detecting "UnsafeUnpacking" vulnerabilities.
3+
*/
4+
5+
import python
6+
import semmle.python.Concepts
7+
import semmle.python.dataflow.new.internal.DataFlowPublic
8+
import semmle.python.ApiGraphs
9+
import semmle.python.dataflow.new.TaintTracking
10+
import semmle.python.frameworks.Stdlib
11+
import semmle.python.dataflow.new.RemoteFlowSources
12+
13+
/**
14+
* Handle those three cases of Tarfile opens:
15+
* - `tarfile.open()`
16+
* - `tarfile.TarFile()`
17+
* - `MKtarfile.Tarfile.open()`
18+
*/
19+
API::Node tarfileOpen() {
20+
result in [
21+
API::moduleImport("tarfile").getMember(["open", "TarFile"]),
22+
API::moduleImport("tarfile").getMember("TarFile").getASubclass().getMember("open")
23+
]
24+
}
25+
26+
/**
27+
* A class for handling the previous three cases, plus the use of `closing` in with the previous cases
28+
*/
29+
class AllTarfileOpens extends API::CallNode {
30+
AllTarfileOpens() {
31+
this = tarfileOpen().getACall()
32+
or
33+
exists(API::Node closing, Node arg |
34+
closing = API::moduleImport("contextlib").getMember("closing") and
35+
this = closing.getACall() and
36+
arg = this.getArg(0) and
37+
arg = tarfileOpen().getACall()
38+
)
39+
}
40+
}
41+
42+
class UnsafeUnpackingConfig extends TaintTracking::Configuration {
43+
UnsafeUnpackingConfig() { this = "UnsafeUnpackingConfig" }
44+
45+
override predicate isSource(DataFlow::Node source) {
46+
// A source coming from a remote location
47+
source instanceof RemoteFlowSource
48+
or
49+
// A source coming from a CLI argparse module
50+
// see argparse: https://docs.python.org/3/library/argparse.html
51+
exists(MethodCallNode args |
52+
args = source.(AttrRead).getObject().getALocalSource() and
53+
args =
54+
API::moduleImport("argparse")
55+
.getMember("ArgumentParser")
56+
.getReturn()
57+
.getMember("parse_args")
58+
.getACall()
59+
)
60+
or
61+
// A source catching an S3 file download
62+
// see boto3: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.download_file
63+
source =
64+
API::moduleImport("boto3")
65+
.getMember("client")
66+
.getReturn()
67+
.getMember(["download_file", "download_fileobj"])
68+
.getACall()
69+
.getArg(2)
70+
or
71+
// A source catching an S3 file download
72+
// see boto3: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html
73+
source =
74+
API::moduleImport("boto3")
75+
.getMember("Session")
76+
.getReturn()
77+
.getMember("client")
78+
.getReturn()
79+
.getMember(["download_file", "download_fileobj"])
80+
.getACall()
81+
.getArg(2)
82+
or
83+
// A source download a file using wget
84+
// see wget: https://pypi.org/project/wget/
85+
exists(API::CallNode mcn |
86+
mcn = API::moduleImport("wget").getMember("download").getACall() and
87+
if exists(mcn.getArg(1)) then source = mcn.getArg(1) else source = mcn.getReturn().asSource()
88+
)
89+
or
90+
// catch the Django uploaded files as a source
91+
// see HttpRequest.FILES: https://docs.djangoproject.com/en/4.1/ref/request-response/#django.http.HttpRequest.FILES
92+
source.(AttrRead).getAttributeName() = "FILES"
93+
}
94+
95+
override predicate isSink(DataFlow::Node sink) {
96+
(
97+
// A sink capturing method calls to `unpack_archive`.
98+
sink = API::moduleImport("shutil").getMember("unpack_archive").getACall().getArg(0)
99+
or
100+
// A sink capturing method calls to `extractall` without `members` argument.
101+
// For a call to `file.extractall` without `members` argument, `file` is considered a sink.
102+
exists(MethodCallNode call, AllTarfileOpens atfo |
103+
call = atfo.getReturn().getMember("extractall").getACall() and
104+
not exists(call.getArgByName("members")) and
105+
sink = call.getObject()
106+
)
107+
or
108+
// A sink capturing method calls to `extractall` with `members` argument.
109+
// For a call to `file.extractall` with `members` argument, `file` is considered a sink if not
110+
// a the `members` argument contains a NameConstant as None, a List or call to the method `getmembers`.
111+
// Otherwise, the argument of `members` is considered a sink.
112+
exists(MethodCallNode call, Node arg, AllTarfileOpens atfo |
113+
call = atfo.getReturn().getMember("extractall").getACall() and
114+
arg = call.getArgByName("members") and
115+
if
116+
arg.asCfgNode() instanceof NameConstantNode or
117+
arg.asCfgNode() instanceof ListNode
118+
then sink = call.getObject()
119+
else
120+
if arg.(MethodCallNode).getMethodName() = "getmembers"
121+
then sink = arg.(MethodCallNode).getObject()
122+
else sink = call.getArgByName("members")
123+
)
124+
or
125+
// An argument to `extract` is considered a sink.
126+
exists(AllTarfileOpens atfo |
127+
sink = atfo.getReturn().getMember("extract").getACall().getArg(0)
128+
)
129+
or
130+
//An argument to `_extract_member` is considered a sink.
131+
exists(MethodCallNode call, AllTarfileOpens atfo |
132+
call = atfo.getReturn().getMember("_extract_member").getACall() and
133+
call.getArg(1).(AttrRead).accesses(sink, "name")
134+
)
135+
) and
136+
not sink.getScope().getLocation().getFile().inStdlib()
137+
}
138+
139+
override predicate isAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
140+
// Reading the response
141+
nodeTo.(MethodCallNode).calls(nodeFrom, "read")
142+
or
143+
// Open a file for access
144+
exists(MethodCallNode cn |
145+
cn.calls(nodeTo, "open") and
146+
cn.flowsTo(nodeFrom)
147+
)
148+
or
149+
// Open a file for access using builtin
150+
exists(API::CallNode cn |
151+
cn = API::builtin("open").getACall() and
152+
nodeTo = cn.getArg(0) and
153+
cn.flowsTo(nodeFrom)
154+
)
155+
or
156+
// Write access
157+
exists(MethodCallNode cn |
158+
cn.calls(nodeTo, "write") and
159+
nodeFrom = cn.getArg(0)
160+
)
161+
or
162+
// Retrieve Django uploaded files
163+
// see getlist(): https://docs.djangoproject.com/en/4.1/ref/request-response/#django.http.QueryDict.getlist
164+
// see chunks(): https://docs.djangoproject.com/en/4.1/ref/files/uploads/#django.core.files.uploadedfile.UploadedFile.chunks
165+
nodeTo.(MethodCallNode).calls(nodeFrom, ["getlist", "get", "chunks"])
166+
or
167+
// Considering the use of "fs"
168+
// see fs: https://docs.djangoproject.com/en/4.1/ref/files/storage/#the-filesystemstorage-class
169+
nodeTo =
170+
API::moduleImport("django")
171+
.getMember("core")
172+
.getMember("files")
173+
.getMember("storage")
174+
.getMember("FileSystemStorage")
175+
.getReturn()
176+
.getMember(["save", "path"])
177+
.getACall() and
178+
nodeFrom = nodeTo.(MethodCallNode).getArg(0)
179+
or
180+
// Accessing the name or raw content
181+
nodeTo.(AttrRead).accesses(nodeFrom, ["name", "raw"])
182+
or
183+
// Join the base_dir to the filename
184+
nodeTo = API::moduleImport("os").getMember("path").getMember("join").getACall() and
185+
nodeFrom = nodeTo.(API::CallNode).getArg(1)
186+
or
187+
// Go through an Open for a Tarfile
188+
nodeTo = tarfileOpen().getACall() and nodeFrom = nodeTo.(MethodCallNode).getArg(0)
189+
or
190+
// Handle the case where the getmembers is used.
191+
nodeTo.(MethodCallNode).calls(nodeFrom, "getmembers") and
192+
nodeFrom instanceof AllTarfileOpens
193+
or
194+
// To handle the case of `with closing(tarfile.open()) as file:`
195+
// we add a step from the first argument of `closing` to the call to `closing`,
196+
// whenever that first argument is a return of `tarfile.open()`.
197+
nodeTo = API::moduleImport("contextlib").getMember("closing").getACall() and
198+
nodeFrom = nodeTo.(API::CallNode).getArg(0) and
199+
nodeFrom = tarfileOpen().getReturn().getAValueReachableFromSource()
200+
or
201+
// see Path : https://docs.python.org/3/library/pathlib.html#pathlib.Path
202+
nodeTo = API::moduleImport("pathlib").getMember("Path").getACall() and
203+
nodeFrom = nodeTo.(API::CallNode).getArg(0)
204+
or
205+
// Use of absolutepath
206+
// see absolute : https://docs.python.org/3/library/pathlib.html#pathlib.Path.absolute
207+
exists(API::CallNode mcn |
208+
mcn = API::moduleImport("pathlib").getMember("Path").getACall() and
209+
nodeTo = mcn.getAMethodCall("absolute") and
210+
nodeFrom = mcn.getArg(0)
211+
)
212+
}
213+
}

0 commit comments

Comments
 (0)