 import tempfile
 import unicodedata
 import warnings
+from fnmatch import fnmatch
 from collections import defaultdict
 from datetime import date
 from functools import partial
@@ -127,6 +128,7 @@ def find_locale_dir():
 
 CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
 DEFAULT_CHECKSUMS = ["sha256", "sha512"]
+DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://*"]
 
 #: Block size used when reading files for hashing:
 HASH_BLOCK_SIZE = 512 * 1024
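Note that the whitelist entries are shell-style wildcard patterns evaluated with fnmatch, so each scheme needs a trailing * to match anything after it. A minimal illustration (the URLs are made up):

from fnmatch import fnmatch

whitelist = ["https://*", "http://*", "ftp://*", "sftp://*"]

# "https://*" is a glob, so it matches any URL beginning with "https://".
print(any(fnmatch("https://example.org/data/page1.tif", pat) for pat in whitelist))  # True
# A local file:// URL matches none of the default patterns.
print(any(fnmatch("file:///etc/passwd", pat) for pat in whitelist))  # False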
@@ -140,7 +142,7 @@ def find_locale_dir():
 
 
 def make_bag(
-    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
+    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None
 ):
     """
     Convert a given directory into a bag. You can pass in arbitrary
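A usage sketch for the new parameter, assuming make_bag forwards it to the Bag constructor (only the signature change is visible in this hunk); the directory name and URL pattern are hypothetical:

import bagit

# Create a bag whose fetch.txt URLs must come from one trusted host.
bag = bagit.make_bag(
    "my_collection",
    checksums=["sha256"],
    fetch_url_whitelist=["https://archive.example.org/*"],
)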
@@ -278,7 +280,7 @@ class Bag(object):
     valid_files = ["bagit.txt", "fetch.txt"]
     valid_directories = ["data"]
 
-    def __init__(self, path=None):
+    def __init__(self, path=None, fetch_url_whitelist=None):
         super(Bag, self).__init__()
         self.tags = {}
         self.info = {}
@@ -299,6 +301,7 @@ def __init__(self, path=None):
         self.normalized_manifest_names = {}
 
         self.algorithms = []
+        self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist
         self.tag_file_name = None
         self.path = abspath(path)
         if path:
@@ -582,7 +585,7 @@ def files_to_be_fetched(self):
         local filename
         """
 
-        for url, file_size, filename in self.fetch_entries():
+        for _, _, filename in self.fetch_entries():
             yield filename
 
     def fetch_files_to_be_fetched(self):
@@ -593,20 +596,23 @@ def fetch_files_to_be_fetched(self):
         opener = build_opener(proxy_handler)
         user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info)
         for url, expected_size, filename in self.fetch_entries():
-            expected_size = int(expected_size)  # FIXME should be int in the first place
+            if not fnmatch_any(url, self.fetch_url_whitelist):
+                raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
+            expected_size = -1 if expected_size == '-' else int(expected_size)
             if filename in self.payload_files():
                 LOGGER.info(_("File already fetched: %s"), filename)
                 continue
             req = Request(url)
             req.add_header('User-Agent', user_agent)
             resp = opener.open(req)
             headers = resp.info()
-            if "content-length" not in headers:
-                LOGGER.warning(_("Server sent no content-length for <%s>"), url)
-            else:
-                content_length = int(headers['content-length'])
-                if content_length != expected_size:
-                    raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
+            if expected_size >= 0:
+                if "content-length" not in headers:
+                    LOGGER.warning(_("Server sent no content-length for <%s>"), url)
+                else:
+                    content_length = int(headers['content-length'])
+                    if content_length != expected_size:
+                        raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))
             with open(join(self.path, filename), 'wb') as out:
                 read = 0
                 while True:
@@ -615,7 +621,7 @@ def fetch_files_to_be_fetched(self):
                         break
                     read += len(block)
                     out.write(block)
-                if read != expected_size:
+                if expected_size >= 0 and read != expected_size:
                     raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read))
             LOGGER.info(_("Fetched %s from %s"), filename, url)
 
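Per the BagIt specification, the length field of a fetch.txt entry is either the size in octets or "-" when unknown; with this change an unknown size is stored as -1, and both the Content-Length comparison and the post-download size check are skipped. A rough sketch of how such entries parse (invented URLs and filenames, and a simplified split rather than the library's actual fetch.txt parser):

# Two illustrative fetch.txt lines: one with a known size, one with "-" (unknown).
lines = [
    "https://example.org/images/page1.tif 13245678 data/page1.tif",
    "https://example.org/images/page2.tif - data/page2.tif",
]
for line in lines:
    url, expected_size, filename = line.split(None, 2)
    expected_size = -1 if expected_size == '-' else int(expected_size)
    print(url, expected_size, filename)  # -1 means "do not check the size"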
@@ -799,15 +805,10 @@ def validate_fetch(self):
         Raises `BagError` for errors and otherwise returns no value
         """
 
-        for url, file_size, filename in self.fetch_entries():
-            # fetch_entries will raise a BagError for unsafe filenames
-            # so at this point we will check only that the URL is minimally
-            # well formed:
-            parsed_url = urlparse(url)
-
-            # ensure url is a remote URL, not file://
-            if not all((parsed_url.scheme, parsed_url.netloc)):
-                raise BagError(_("Malformed URL in fetch.txt: %s") % url)
+        for url, expected_size, filename in self.fetch_entries():
+            # ensure url matches one of the allowed patterns
+            if not fnmatch_any(url, self.fetch_url_whitelist):
+                raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist))
 
     def _validate_contents(self, processes=1, fast=False, completeness_only=False):
         if fast and not self.has_oxum():
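Taken together, validate_fetch() now rejects any fetch.txt URL that matches none of the configured patterns, which covers the old file:// check as well. A minimal sketch, assuming a hypothetical bag directory whose fetch.txt contains a file:// URL:

import bagit

bag = bagit.Bag("unsafe_bag", fetch_url_whitelist=["https://*"])
try:
    bag.validate_fetch()
except bagit.BagError as err:
    # e.g. "Malformed URL in fetch.txt: file:///etc/passwd, matches none of ..."
    print("rejected:", err)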
@@ -1450,6 +1451,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
 
     return results
 
+# Return True if the string matches any of the given fnmatch patterns
+def fnmatch_any(s, pats):
+    for pat in pats:
+        if fnmatch(s, pat):
+            return True
+    return False
 
 def _encode_filename(s):
     s = s.replace("\r", "%0D")
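For completeness, the helper is just fnmatch applied across a list of patterns; a quick check against the patched module (values are illustrative):

from bagit import DEFAULT_FETCH_URL_WHITELIST, fnmatch_any

print(fnmatch_any("https://example.org/a.tif", DEFAULT_FETCH_URL_WHITELIST))  # True
print(fnmatch_any("file:///etc/passwd", DEFAULT_FETCH_URL_WHITELIST))         # False
print(fnmatch_any("https://example.org/x", ["https://other.example/*"]))      # False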