|
| 1 | +""" |
| 2 | +Helper functions to set-up storage plugins for remote inputs/outputs. See the |
| 3 | +docstring of `path_or_url` for usage instructions. |
| 4 | +
|
The errors raised by storage plugins are often confusing. For instance, an HTTP
| 6 | +404 error will result in a `MissingInputException` with little hint as to the |
| 7 | +underlying issue. S3 credentials errors are similarly confusing and we attempt |
| 8 | +to check these ourselves to improve UX here. |
| 9 | +""" |
| 10 | + |
| 11 | +from urllib.parse import urlparse |
| 12 | + |
# Keep a list of known public buckets, which we'll allow uncredentialled (unsigned) access to
# We could make this config-definable in the future
PUBLIC_BUCKETS = {'nextstrain-data'}

# Keep track of registered storage plugins to enable reuse.
# Keys used: 's3_signed', 's3_unsigned', 'http'
_storage_registry = {}
| 19 | + |
class RemoteFilesMissingCredentials(Exception):
    """
    Raised when a signed (S3) request is required but no AWS credentials could
    be found. Callers (see `path_or_url`) catch this and re-raise with a
    user-friendly message.
    """
    pass
| 22 | + |
def _storage_s3(*, bucket, keep_local, retries) -> snakemake.storage.StorageProviderProxy:
    """
    Registers and returns an instance of snakemake-storage-plugin-s3.

    Typically AWS credentials are required for _any_ request however we allow
    requests to known public buckets (see `PUBLIC_BUCKETS`) to be unsigned which
    allows for a nice user experience in the common case of downloading inputs
    from s3://nextstrain-data.

    Arguments:
        bucket (str): the S3 bucket name, used to decide whether unsigned
            (anonymous) access is acceptable.
        keep_local (bool): passed through to the storage plugin.
        retries (int): passed through to the storage plugin.

    Raises:
        RemoteFilesMissingCredentials: when a signed request is needed but no
            AWS credentials could be found.

    The intended behaviour for various (S3) URIs supplied to `path_or_url` is:

    |          | S3 buckets                 | credentials present | credentials missing |
    |----------|----------------------------|---------------------|---------------------|
    | download | private / private + public | signed              | Credentials Error   |
    |          | public                     | signed              | unsigned            |
    | upload   | private / private + public | signed              | Credentials Error   |
    |          | public                     | signed              | AccessDenied Error  |
    """
    # If the bucket is public then we may use an unsigned request which has the nice UX
    # of not needing credentials to be present. If we've made other signed requests _or_
    # credentials are present then we just sign everything. This has implications for upload:
    # if you attempt to upload to a public bucket without credentials then we allow that here
    # and you'll get a subsequent `AccessDenied` error when the upload is attempted.
    if bucket in PUBLIC_BUCKETS and \
        "s3_signed" not in _storage_registry and \
        ("s3_unsigned" in _storage_registry or not _aws_credentials_present()):

        # Reuse the unsigned provider if one was already registered
        if provider:=_storage_registry.get('s3_unsigned', None):
            return provider

        from botocore import UNSIGNED # dependency of snakemake-storage-plugin-s3
        # Snakemake `storage` directive: registers a tagged provider which is
        # then accessible as `storage.s3_unsigned`
        storage s3_unsigned:
            provider="s3",
            signature_version=UNSIGNED,
            retries=retries,
            keep_local=keep_local,

        _storage_registry['s3_unsigned'] = storage.s3_unsigned
        return _storage_registry['s3_unsigned']

    # Resource fetched/uploaded via a signed request, which will require AWS credentials
    if provider:=_storage_registry.get('s3_signed', None):
        return provider

    # Enforce the presence of credentials to paper over <https://github.com/snakemake/snakemake/issues/3663>
    if not _aws_credentials_present():
        raise RemoteFilesMissingCredentials()

    # the tag appears in the local file path, so reference 'signed' to give a hint about credential errors
    storage s3_signed:
        provider="s3",
        retries=retries,
        keep_local=keep_local,

    _storage_registry['s3_signed'] = storage.s3_signed
    return _storage_registry['s3_signed']
| 77 | + |
def _aws_credentials_present() -> bool:
    """
    Return True when boto3 can resolve AWS credentials from any of its usual
    sources (environment variables, config files, instance metadata, etc.).
    """
    import boto3  # dependency of snakemake-storage-plugin-s3
    return boto3.Session().get_credentials() is not None
| 83 | + |
def _storage_http(*, keep_local, retries) -> snakemake.storage.StorageProviderProxy:
    """
    Registers and returns an instance of snakemake-storage-plugin-http.

    Arguments:
        keep_local (bool): passed through to the storage plugin.
        retries (int): passed through to the storage plugin.
    """
    # Reuse the provider if one was already registered
    if provider:=_storage_registry.get('http', None):
        return provider

    # Untagged Snakemake `storage` directive: the registered provider is
    # exposed under the provider name, i.e. `storage.http`
    storage:
        provider="http",
        allow_redirects=True,
        supports_head=True,
        keep_local=keep_local,
        retries=retries,

    _storage_registry['http'] = storage.http
    return _storage_registry['http']
| 100 | + |
| 101 | + |
def path_or_url(uri, *, keep_local=True, retries=2) -> str:
    """
    Wrap *uri* with the applicable Snakemake storage plugin so that remote
    resources can be used transparently as rule inputs/outputs. Local
    filepaths (no URL scheme) are returned unchanged.

    For example, the following rule downloads its inputs over HTTPS and
    uploads its output to S3:

        rule filter:
            input:
                sequences = path_or_url("https://data.nextstrain.org/..."),
                metadata = path_or_url("https://data.nextstrain.org/..."),
            output:
                sequences = path_or_url("s3://...")
            shell:
                r'''
                augur filter \
                    --sequences {input.sequences:q} \
                    --metadata {input.metadata:q} \
                    --metadata-id-columns accession \
                    --output-sequences {output.sequences:q}
                '''

    If *keep_local* is True (the default) then downloaded/uploaded files will
    remain in `.snakemake/storage/`. Note that a previously downloaded file
    (via `keep_local=True`) may still be re-downloaded if the storage plugin
    decides the local copy is out of date.

    Depending on the *uri* authentication may be required; see the relevant
    helper (e.g. `_storage_s3`) for details.

    See <https://snakemake.readthedocs.io/en/stable/snakefiles/storage.html>
    for more information on Snakemake storage plugins. Note: various snakemake
    plugins will be required depending on the URIs provided.
    """
    parsed = urlparse(uri)
    scheme = parsed.scheme

    # No scheme → local filepath, no storage wrapper required
    if not scheme:
        return uri

    if scheme == 's3':
        try:
            return _storage_s3(bucket=parsed.netloc, keep_local=keep_local, retries=retries)(uri)
        except RemoteFilesMissingCredentials as e:
            # Re-raise with the URI for a friendlier error message
            raise Exception(f"AWS credentials are required to access {uri!r}") from e

    if scheme == 'https':
        return _storage_http(keep_local=keep_local, retries=retries)(uri)

    if scheme == 'http':
        raise Exception(f"HTTP remote file support is not implemented in nextstrain workflows (attempting to access {uri!r}).\n"
                        "Please use an HTTPS address instead.")

    if scheme in ('gs', 'gcs'):
        raise Exception(f"Google Storage is not yet implemented for nextstrain workflows (attempting to access {uri!r}).\n"
                        "Please get in touch if you require this functionality and we can add it to our workflows")

    raise Exception(f"Input address {uri!r} (scheme={scheme!r}) is from a non-supported remote")
0 commit comments