1+ import shutil
12import hashlib
23import pathlib
34import functools
45import urllib .request
56from typing import List
67
8+ from .os import chdir
9+
710
811class HashValidationError (Exception ):
912 """
@@ -34,10 +37,16 @@ def __str__(self):
3437 return f"Protocol of URL { self .url !r} is not in allowlist: { self .allowlist !r} "
3538
3639
40+ # Default list of URL protocols allowed
3741DEFAULT_PROTOCOL_ALLOWLIST : List [str ] = ["https://" ]
3842
3943
4044def sync_urlopen (url , protocol_allowlist = DEFAULT_PROTOCOL_ALLOWLIST ):
45+ """
46+ Check that ``url`` has a protocol defined in ``protocol_allowlist``, then
47+ return the result of calling :py:func:`urllib.request.urlopen` passing it
48+ ``url``.
49+ """
4150 allowed_protocol = False
4251 for protocol in protocol_allowlist :
4352 if url .startswith (protocol ):
@@ -54,6 +63,24 @@ def cached_download(
5463 expected_hash ,
5564 protocol_allowlist = DEFAULT_PROTOCOL_ALLOWLIST ,
5665):
66+ """
67+ Download a file and verify the hash of the downloaded file. If the file
68+ already exists and the hash matches, do not re-download the file.
69+
70+ Examples
71+ --------
72+
73+ >>> @cached_download(
74+ ... "https://github.com/intel/dffml/raw/152c2b92535fac6beec419236f8639b0d75d707d/MANIFEST.in",
75+ ... "MANIFEST.in",
76+ ... "f7aadf5cdcf39f161a779b4fa77ec56a49630cf7680e21fb3dc6c36ce2d8c6fae0d03d5d3094a6aec4fea1561393c14c",
77+ ... )
78+ ... async def first_line_in_manifest_152c2b(manifest):
79+ ... return manifest.read_text().split()[:2]
80+ >>>
81+ >>> asyncio.run(first_line_in_manifest_152c2b())
82+ ['include', 'README.md']
83+ """
5784 target_path = pathlib .Path (target_path )
5885
5986 def validate_hash (error : bool = True ):
@@ -69,7 +96,7 @@ def validate_hash(error: bool = True):
6996 def mkwrapper (func ):
7097 @functools .wraps (func )
7198 async def wrapper (* args , ** kwds ):
72- args = list (args ) + [str ( target_path ) ]
99+ args = list (args ) + [target_path ]
73100 if not target_path .is_file () or not validate_hash (error = False ):
74101 # TODO(p5) Blocking request in coroutine
75102 with sync_urlopen (
@@ -82,3 +109,60 @@ async def wrapper(*args, **kwds):
82109 return wrapper
83110
84111 return mkwrapper
112+
113+
def cached_download_unpack_archive(
    url,
    file_path,
    directory_path,
    expected_hash,
    protocol_allowlist=DEFAULT_PROTOCOL_ALLOWLIST,
):
    """
    Download an archive and extract it to a directory on disk.

    Verify the hash of the downloaded file. If the hash matches the file is not
    re-downloaded.

    The archive is only unpacked when ``directory_path`` does not exist yet. If
    downloading, hash validation, or unpacking raises, the directory created
    for this attempt is removed again before the exception propagates, so that
    the next call retries instead of handing an empty or partially unpacked
    directory to the decorated function.

    .. warning::

        This function does not verify the integrity of the unpacked archive on
        disk. Only the downloaded file.

    Parameters
    ----------
    url
        Location of the archive. Forwarded to :py:func:`cached_download`.
    file_path
        Where the downloaded archive is stored on disk. Forwarded to
        :py:func:`cached_download` as its target path.
    directory_path
        Directory the archive is unpacked into. Created (including parents) if
        it does not exist.
    expected_hash
        Expected hash of the downloaded archive. Forwarded to
        :py:func:`cached_download`.
    protocol_allowlist
        URL protocols which are allowed. Forwarded to
        :py:func:`cached_download`.

    Returns
    -------
    A decorator for coroutine functions. The decorated function is called with
    its original arguments plus ``directory_path`` (as a
    :py:class:`pathlib.Path`) appended as the last positional argument.

    Examples
    --------

    >>> @cached_download_unpack_archive(
    ...     "https://github.com/intel/dffml/archive/152c2b92535fac6beec419236f8639b0d75d707d.tar.gz",
    ...     "dffml.tar.gz",
    ...     "dffml",
    ...     "32ba082cd8056ff4ddcb68691a590c3cb8fea2ff75c0265b8d844c5edc7eaef54136160c6090750e562059b957355b15",
    ... )
    ... async def files_in_dffml_commit_152c2b(dffml_dir):
    ...     return len(list(dffml_dir.rglob("**/*")))
    >>>
    >>> asyncio.run(files_in_dffml_commit_152c2b())
    594
    """
    directory_path = pathlib.Path(directory_path)

    async def extractor(download_path):
        # Pass the extraction directory explicitly instead of changing the
        # process-wide working directory, which would affect any other code
        # running in this process while the archive is being unpacked.
        shutil.unpack_archive(str(download_path), str(directory_path))

    # Wrapping the extractor with cached_download means calling extract()
    # downloads (or reuses) the archive, validates it, and then unpacks it.
    extract = cached_download(
        url, file_path, expected_hash, protocol_allowlist=protocol_allowlist,
    )(extractor)

    def mkwrapper(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwds):
            if not directory_path.is_dir():
                directory_path.mkdir(parents=True)
                try:
                    await extract()
                except BaseException:
                    # The existence of the directory is what marks the archive
                    # as already unpacked. Remove it on any failure (including
                    # cancellation) so a later call does not skip extraction
                    # and use an empty or partial directory.
                    shutil.rmtree(str(directory_path), ignore_errors=True)
                    raise
            return await func(*args, directory_path, **kwds)

        return wrapper

    return mkwrapper
0 commit comments