Skip to content

Commit c34c448

Browse files
committed
add vendored remote_files.smk
1 parent eef001c commit c34c448

File tree

1 file changed

+159
-0
lines changed

1 file changed

+159
-0
lines changed

ingest/rules/remote_files.smk

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
"""
2+
Helper functions to set-up storage plugins for remote inputs/outputs. See the
3+
docstring of `path_or_url` for usage instructions.
4+
5+
The errors raised by storage plugins are often confusing. For instance, a HTTP
6+
404 error will result in a `MissingInputException` with little hint as to the
7+
underlying issue. S3 credentials errors are similarly confusing and we attempt
8+
to check these ourselves to improve UX here.
9+
"""
10+
11+
from urllib.parse import urlparse
12+
13+
# Keep a list of known public buckets, which we'll allow uncredentialled (unsigned) access to
14+
# We could make this config-definable in the future
15+
PUBLIC_BUCKETS = set(['nextstrain-data'])
16+
17+
# Keep track of registered storage plugins to enable reuse
18+
_storage_registry = {}
19+
20+
class RemoteFilesMissingCredentials(Exception):
    """Raised when AWS credentials are required but could not be resolved."""
22+
23+
def _storage_s3(*, bucket, keep_local, retries) -> snakemake.storage.StorageProviderProxy:
    """
    Registers and returns an instance of snakemake-storage-plugin-s3. Typically AWS
    credentials are required for _any_ request however we allow requests to known
    public buckets (see `PUBLIC_BUCKETS`) to be unsigned which allows for a nice user
    experience in the common case of downloading inputs from s3://nextstrain-data.

    The intended behaviour for various (S3) URIs supplied to `path_or_url` is:

    |          | S3 buckets                 | credentials present | credentials missing |
    |----------|----------------------------|---------------------|---------------------|
    | download | private / private + public | signed              | Credentials Error   |
    |          | public                     | signed              | unsigned            |
    | upload   | private / private + public | signed              | Credentials Error   |
    |          | public                     | signed              | AccessDenied Error  |

    The returned provider proxy is invoked with a URI by `path_or_url` to wrap it.
    Raises `RemoteFilesMissingCredentials` when a signed request is needed but no
    AWS credentials can be found.
    """
    # If the bucket is public then we may use an unsigned request which has the nice UX
    # of not needing credentials to be present. If we've made other signed requests _or_
    # credentials are present then we just sign everything. This has implications for upload:
    # if you attempt to upload to a public bucket without credentials then we allow that here
    # and you'll get a subsequent `AccessDenied` error when the upload is attempted.
    if bucket in PUBLIC_BUCKETS and \
        "s3_signed" not in _storage_registry and \
        ("s3_unsigned" in _storage_registry or not _aws_credentials_present()):

        # Reuse the unsigned provider if it was already registered
        if provider:=_storage_registry.get('s3_unsigned', None):
            return provider

        from botocore import UNSIGNED # dependency of snakemake-storage-plugin-s3
        # Tagged Snakemake storage directive; exposed afterwards as `storage.s3_unsigned`
        storage s3_unsigned:
            provider="s3",
            signature_version=UNSIGNED,
            retries=retries,
            keep_local=keep_local,

        _storage_registry['s3_unsigned'] = storage.s3_unsigned
        return _storage_registry['s3_unsigned']

    # Resource fetched/uploaded via a signed request, which will require AWS credentials
    if provider:=_storage_registry.get('s3_signed', None):
        return provider

    # Enforce the presence of credentials to paper over <https://github.com/snakemake/snakemake/issues/3663>
    if not _aws_credentials_present():
        raise RemoteFilesMissingCredentials()

    # the tag appears in the local file path, so reference 'signed' to give a hint about credential errors
    storage s3_signed:
        provider="s3",
        retries=retries,
        keep_local=keep_local,

    _storage_registry['s3_signed'] = storage.s3_signed
    return _storage_registry['s3_signed']
77+
78+
def _aws_credentials_present() -> bool:
    """
    Report whether boto3 can resolve AWS credentials from its usual sources
    (environment, config files, etc.).
    """
    import boto3  # dependency of snakemake-storage-plugin-s3
    return boto3.Session().get_credentials() is not None
83+
84+
def _storage_http(*, keep_local, retries) -> snakemake.storage.StorageProviderProxy:
    """
    Registers and returns an instance of snakemake-storage-plugin-http.
    The returned provider proxy is invoked with a URI by `path_or_url`.
    """
    # Reuse the provider if we've already registered it
    if provider:=_storage_registry.get('http', None):
        return provider

    # Untagged Snakemake storage directive; exposed afterwards as `storage.http`
    storage:
        provider="http",
        allow_redirects=True,
        supports_head=True,
        keep_local=keep_local,
        retries=retries,

    _storage_registry['http'] = storage.http
    return _storage_registry['http']
100+
101+
102+
def path_or_url(uri, *, keep_local=True, retries=2) -> str:
    """
    Intended for use in Snakemake inputs / outputs to transparently use remote
    resources. Returns the URI wrapped by an applicable storage plugin; local
    filepaths are returned unchanged.

    For example, the following rule downloads its inputs over HTTPS and uploads
    its output to S3:

        rule filter:
            input:
                sequences = path_or_url("https://data.nextstrain.org/..."),
                metadata = path_or_url("https://data.nextstrain.org/..."),
            output:
                sequences = path_or_url("s3://...")
            shell:
                r'''
                augur filter \
                    --sequences {input.sequences:q} \
                    --metadata {input.metadata:q} \
                    --metadata-id-columns accession \
                    --output-sequences {output.sequences:q}
                '''

    With *keep_local* True (the default) downloaded/uploaded files remain under
    `.snakemake/storage/`. A previously kept local copy does not guarantee the
    file won't be re-downloaded: the storage plugin may decide it is out of date.

    Depending on the *uri*, authentication may be required — see the relevant
    helper (e.g. `_storage_s3`) for details. Various Snakemake storage plugins
    must be installed depending on the URI schemes used; see
    <https://snakemake.readthedocs.io/en/stable/snakefiles/storage.html>.
    """
    parsed = urlparse(uri)
    scheme = parsed.scheme

    # No scheme => a plain local filepath; no storage wrapper needed.
    if scheme == '':
        return uri

    if scheme == 's3':
        # Re-raise credential problems with the offending URI for a clearer error.
        try:
            return _storage_s3(bucket=parsed.netloc, keep_local=keep_local, retries=retries)(uri)
        except RemoteFilesMissingCredentials as err:
            raise Exception(f"AWS credentials are required to access {uri!r}") from err

    if scheme == 'https':
        return _storage_http(keep_local=keep_local, retries=retries)(uri)

    if scheme == 'http':
        raise Exception(f"HTTP remote file support is not implemented in nextstrain workflows (attempting to access {uri!r}).\n"
                        "Please use an HTTPS address instead.")

    if scheme in ('gs', 'gcs'):
        raise Exception(f"Google Storage is not yet implemented for nextstrain workflows (attempting to access {uri!r}).\n"
                        "Please get in touch if you require this functionality and we can add it to our workflows")

    raise Exception(f"Input address {uri!r} (scheme={parsed.scheme!r}) is from a non-supported remote")

0 commit comments

Comments
 (0)