Skip to content

Commit cacf9d9

Browse files
committed
Implement a whitelist mechanism to validate fetch entries
By default, only allow HTTP(S) and (S)FTP URLs
1 parent 91f5f4d commit cacf9d9

File tree

3 files changed

+65
-50
lines changed

3 files changed

+65
-50
lines changed

bagit.py

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import tempfile
1717
import unicodedata
1818
import warnings
19+
from fnmatch import fnmatch
1920
from collections import defaultdict
2021
from datetime import date
2122
from functools import partial
@@ -127,6 +128,7 @@ def find_locale_dir():
127128

128129
CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
129130
DEFAULT_CHECKSUMS = ["sha256", "sha512"]
131+
DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://*"]
130132

131133
#: Block size used when reading files for hashing:
132134
HASH_BLOCK_SIZE = 512 * 1024
@@ -140,7 +142,7 @@ def find_locale_dir():
140142

141143

142144
def make_bag(
143-
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
145+
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None
144146
):
145147
"""
146148
Convert a given directory into a bag. You can pass in arbitrary
@@ -278,7 +280,7 @@ class Bag(object):
278280
valid_files = ["bagit.txt", "fetch.txt"]
279281
valid_directories = ["data"]
280282

281-
def __init__(self, path=None):
283+
def __init__(self, path=None, fetch_url_whitelist=None):
282284
super(Bag, self).__init__()
283285
self.tags = {}
284286
self.info = {}
@@ -299,6 +301,7 @@ def __init__(self, path=None):
299301
self.normalized_manifest_names = {}
300302

301303
self.algorithms = []
304+
self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
302305
self.tag_file_name = None
303306
self.path = abspath(path)
304307
if path:
@@ -582,7 +585,7 @@ def files_to_be_fetched(self):
582585
local filename
583586
"""
584587

585-
for url, file_size, filename in self.fetch_entries():
588+
for _, _, filename in self.fetch_entries():
586589
yield filename
587590

588591
def fetch_files_to_be_fetched(self):
@@ -593,20 +596,23 @@ def fetch_files_to_be_fetched(self):
593596
opener = build_opener(proxy_handler)
594597
user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info)
595598
for url, expected_size, filename in self.fetch_entries():
596-
expected_size = int(expected_size) # FIXME should be int in the first place
599+
if not fnmatch_any(url, self.fetch_url_whitelist):
600+
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
601+
expected_size = -1 if expected_size == '-' else int(expected_size)
597602
if filename in self.payload_files():
598603
LOGGER.info(_("File already fetched: %s"), filename)
599604
continue
600605
req = Request(url)
601606
req.add_header('User-Agent', user_agent)
602607
resp = opener.open(req)
603608
headers = resp.info()
604-
if "content-length" not in headers:
605-
LOGGER.warning(_("Server sent no content-length for <%s>"), url)
606-
else:
607-
content_length = int(headers['content-length'])
608-
if content_length != expected_size:
609-
raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
609+
if expected_size >= 0:
610+
if "content-length" not in headers:
611+
LOGGER.warning(_("Server sent no content-length for <%s>"), url)
612+
else:
613+
content_length = int(headers['content-length'])
614+
if content_length != expected_size:
615+
raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
610616
with open(join(self.path, filename), 'wb') as out:
611617
read = 0
612618
while True:
@@ -615,7 +621,7 @@ def fetch_files_to_be_fetched(self):
615621
break
616622
read += len(block)
617623
out.write(block)
618-
if read != expected_size:
624+
if expected_size >= 0 and read != expected_size:
619625
raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read))
620626
LOGGER.info(_("Fetched %s from %s"), filename, url)
621627

@@ -799,15 +805,10 @@ def validate_fetch(self):
799805
Raises `BagError` for errors and otherwise returns no value
800806
"""
801807

802-
for url, file_size, filename in self.fetch_entries():
803-
# fetch_entries will raise a BagError for unsafe filenames
804-
# so at this point we will check only that the URL is minimally
805-
# well formed:
806-
parsed_url = urlparse(url)
807-
808-
# ensure url is a remote URL, not file://
809-
if not all((parsed_url.scheme, parsed_url.netloc)):
810-
raise BagError(_("Malformed URL in fetch.txt: %s") % url)
808+
for url, expected_size, filename in self.fetch_entries():
809+
# ensure url matches one of the allowed patterns
810+
if not fnmatch_any(url, self.fetch_url_whitelist):
811+
raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
811812

812813
def _validate_contents(self, processes=1, fast=False, completeness_only=False):
813814
if fast and not self.has_oxum():
@@ -1450,6 +1451,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
14501451

14511452
return results
14521453

1454+
# Return True if the string fnmatches any of the given patterns
1455+
def fnmatch_any(s, pats):
1456+
for pat in pats:
1457+
if fnmatch(s, pat):
1458+
return True
1459+
return False
14531460

14541461
def _encode_filename(s):
14551462
s = s.replace("\r", "%0D")

locale/bagit-python.pot

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ msgid ""
88
msgstr ""
99
"Project-Id-Version: PACKAGE VERSION\n"
1010
"Report-Msgid-Bugs-To: \n"
11-
"POT-Creation-Date: 2018-11-27 15:21+0100\n"
11+
"POT-Creation-Date: 2018-12-03 12:06+0100\n"
1212
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
1313
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1414
"Language-Team: LANGUAGE <[email protected]>\n"
@@ -168,6 +168,10 @@ msgstr ""
168168
msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe"
169169
msgstr ""
170170

171+
#, python-format
172+
msgid "Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s"
173+
msgstr ""
174+
171175
#, python-format
172176
msgid "File already fetched: %s"
173177
msgstr ""
@@ -225,10 +229,6 @@ msgstr ""
225229
msgid "Expected %s to contain \"bagit.txt\""
226230
msgstr ""
227231

228-
#, python-format
229-
msgid "Malformed URL in fetch.txt: %s"
230-
msgstr ""
231-
232232
msgid "Fast validation requires bag-info.txt to include Payload-Oxum"
233233
msgstr ""
234234

test.py

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,33 +1081,41 @@ def test_fetch_unsafe_payloads(self):
10811081

10821082
self.assertEqual(expected_msg, str(cm.exception))
10831083

1084-
def test_fetch_malformed_url(self):
1085-
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1086-
print(
1087-
"//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
1088-
file=fetch_txt,
1089-
)
1090-
1091-
self.bag.save(manifests=True)
1092-
1093-
expected_msg = (
1094-
"Malformed URL in fetch.txt: //photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg"
1095-
)
1096-
1097-
with self.assertRaises(bagit.BagError) as cm:
1084+
def test_invalid_urls(self):
1085+
invalid_urls = [
1086+
'//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg',
1087+
'file://%s' % j(self.tmpdir, "mock_data"),
1088+
'../../../../../etc/passwd',
1089+
]
1090+
for url in invalid_urls:
1091+
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1092+
print("%s - data/mock_data" % url, file=fetch_txt)
1093+
with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % url):
1094+
self.bag.validate_fetch()
1095+
1096+
def test_invalid_urls_whitelist(self):
1097+
self.bag.fetch_url_whitelist = [
1098+
'https://my.inst.edu/data/*.png'
1099+
]
1100+
valid_urls = [
1101+
'https://my.inst.edu/data/foo.png'
1102+
]
1103+
invalid_urls = [
1104+
'https://my.inst.edu/data/foo',
1105+
'https://my.inst.edu/robots.txt',
1106+
'http://my.inst.edu/data/foo',
1107+
'https://example.org',
1108+
]
1109+
for url in invalid_urls:
1110+
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1111+
print("%s - data/mock_data" % url, file=fetch_txt)
1112+
with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % url):
1113+
self.bag.validate_fetch()
1114+
for url in valid_urls:
1115+
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1116+
print("%s - data/mock_data" % url, file=fetch_txt)
10981117
self.bag.validate_fetch()
10991118

1100-
self.assertEqual(expected_msg, str(cm.exception))
1101-
1102-
# FIXME: Won't work since file:// URLs are rejected
1103-
# def test_fetching_payload_file(self):
1104-
# with open(j(self.tmpdir, "mock_data"), "w") as mock_data:
1105-
# print("Lorem ipsum dolor sit", file=mock_data)
1106-
# with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1107-
# print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt)
1108-
# self.bag.save(manifests=True)
1109-
# self.bag.validate_fetch()
1110-
11111119
def test_fetching_payload_file(self):
11121120
test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
11131121
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:

0 commit comments

Comments
 (0)