Skip to content

Commit e991f07

Browse files
committed
Add an experimental GCSDownloader component
This is an implementation of HTTPS download from GCS collections via Location header sniffing, which is the only mechanism we have available on all GCS versions for translating a URL to a client ID.
1 parent f9cd908 commit e991f07

File tree

5 files changed

+427
-0
lines changed

5 files changed

+427
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Added
2+
-----
3+
4+
- Added a new ``globus_sdk.experimental.gcs_downloader`` module which provides
5+
experimental tooling for downloading files from HTTPS GCS Collections. The
6+
module provides helper utilities and a central ``GCSDownloader`` class for
7+
detecting requirements and downloading files. (:pr:`NUMBER`)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
.. _gcs_downloader:
2+
3+
.. currentmodule:: globus_sdk.experimental.gcs_downloader
4+
5+
GCS Downloader
6+
==============
7+
8+
A :class:`GCSDownloader` is an object which handles connections to an
9+
HTTPS-enabled collection and single file downloads over HTTPS.
10+
11+
It primarily features two APIs:
12+
13+
1. Initialization and use as a context manager
14+
2. :meth:`GCSDownloader.read_file` to get a single file by URL
15+
16+
.. autoclass:: GCSDownloader
17+
:members:
18+
:member-order: bysource
19+
20+
.. autoclass:: HTTPSClientConstructor
21+
:members:
22+
:member-order: bysource
23+
24+
.. autoclass:: GCSCollectionHTTPSClient
25+
:members:
26+
:member-order: bysource
27+
28+
Example Usage
29+
-------------
30+
31+
.. code-block:: python
32+
33+
import argparse

import globus_sdk
from globus_sdk.experimental.gcs_downloader import GCSDownloader
34+
35+
parser = argparse.ArgumentParser()
36+
parser.add_argument(
37+
"--url",
38+
# example value is used as a default
39+
default=(
40+
"https://m-d3a2c3.collection1.tutorials.globus.org"
41+
"/home/share/godata/file2.txt"
42+
),
43+
)
44+
args = parser.parse_args()
45+
46+
# SDK Tutorial Client ID - <replace this with your own client>
47+
client_id = "61338d24-54d5-408f-a10d-66c06b59f6d2"
48+
with globus_sdk.UserApp("gcs-downloader-demo", client_id=client_id) as app:
49+
with GCSDownloader(app) as downloader:
50+
print(downloader.read_file(args.url))

docs/experimental/index.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ Globus SDK Experimental Components
1010

1111
**Use at your own risk.**
1212

13+
.. toctree::
14+
:caption: Experimental Constructs
15+
:maxdepth: 1
16+
17+
gcs_downloader
1318

1419
Experimental Construct Lifecycle
1520
--------------------------------
Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
"""
2+
The GCSDownloader provides HTTPS file download capabilities for Globus Connect Server.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
import logging
8+
import sys
9+
import types
10+
import typing as t
11+
import urllib.parse
12+
13+
import globus_sdk
14+
import globus_sdk.scopes
15+
import globus_sdk.transport
16+
from globus_sdk._internal.classprop import classproperty
17+
from globus_sdk._internal.type_definitions import Closable
18+
from globus_sdk.authorizers import GlobusAuthorizer
19+
from globus_sdk.transport.default_retry_checks import DEFAULT_RETRY_CHECKS
20+
21+
if sys.version_info >= (3, 11):
22+
from typing import Self
23+
else:
24+
from typing_extensions import Self
25+
26+
log = logging.getLogger(__name__)
27+
28+
29+
class HTTPSClientConstructor(t.Protocol):
    """
    Structural type for the client factory which may customize a GCSDownloader.

    Any callable matching this signature may be supplied to a downloader in place
    of a concrete client. It is invoked with keyword arguments only and must
    return a :class:`GCSCollectionHTTPSClient`.

    :param collection_client_id: The ID of the collection the client will target.
    :param default_scope_requirements: The scopes the new client should require.
    :param base_url: The base URL (scheme and host) of the collection.
    """

    def __call__(
        self,
        *,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope],
        base_url: str,
    ) -> GCSCollectionHTTPSClient: ...
39+
40+
41+
class GCSDownloader:
    """
    An object which manages connection and authentication state to enable HTTPS
    downloads from a specific Globus Connect Server.

    The initial request to read a file features support for determining authentication
    requirements dynamically, and subsequent requests will reuse that authentication
    data.

    Using a single :class:`GCSDownloader` to access distinct collections is not
    supported. A separate downloader should be used for each collection. Once a
    client is in place, :meth:`read_file` rejects URIs whose host differs from the
    host of that client's ``base_url``.

    Downloaders may be used as context managers, in which case they automatically call
    their ``close()`` method on exit:

    >>> with GCSDownloader(app) as downloader:
    ...     print(downloader.read_file(url))

    :param app: The :class:`GlobusApp` used to authenticate calls to this server.
    :param https_client: The underlying client used for the file read request. Typically
        omitted. When not provided, one will be constructed on demand by the downloader.
        As an alternative to providing a client, a callable factory may be passed here,
        which will be given the ``collection_client_id``,
        ``default_scope_requirements``, and ``base_url`` and must return a new client.
    :param transfer_client: A client used when detecting collection information.
        Typically omitted. When not provided, one will be constructed on demand by the
        downloader.
    :param transport: A transport for the downloader, used for authentication
        sniffing operations. When a client is built by the downloader it will
        inherit this transport.
    """

    def __init__(
        self,
        app: globus_sdk.GlobusApp,
        *,
        https_client: GCSCollectionHTTPSClient | HTTPSClientConstructor | None = None,
        transfer_client: globus_sdk.TransferClient | None = None,
        transport: globus_sdk.transport.RequestsTransport | None = None,
    ) -> None:
        self.app = app
        # only resources created by the downloader itself are tracked here;
        # caller-supplied transports and clients are never closed by the downloader
        self._resources_to_close: list[Closable] = []

        if transport is not None:
            self.transport = transport
        else:
            self.transport = globus_sdk.transport.RequestsTransport()
            self._resources_to_close.append(self.transport)

        # the downloader will need a RetryConfig when it uses its own transport
        self._retry_config = globus_sdk.transport.RetryConfig()
        self._retry_config.checks.register_many_checks(DEFAULT_RETRY_CHECKS)

        # three essential cases for https_client:
        # 1. default, setup the default client factory method
        if https_client is None:
            self.https_client: GCSCollectionHTTPSClient | None = None
            self._https_client_constructor: HTTPSClientConstructor = (
                self._default_https_client_constructor
            )
        # 2. concrete client, store (default factory is set for type safety, but will
        #    not be used)
        elif isinstance(https_client, GCSCollectionHTTPSClient):
            self.https_client = https_client
            self._https_client_constructor = self._default_https_client_constructor
        # 3. factory method, store it and no client
        else:
            self.https_client = None
            self._https_client_constructor = https_client

        # set the transfer_client if provided
        self.transfer_client = transfer_client

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """
        Close all resources which are owned by this downloader.

        Resources are removed from the internal list as they are closed, so each
        one is closed at most once. Calling ``close()`` again (for example,
        explicitly and then via ``__exit__``) is a no-op.
        """
        while self._resources_to_close:
            # pop from the front to preserve the original (creation) close order
            resource = self._resources_to_close.pop(0)
            log.debug(
                "closing resource of type %s for %s",
                type(resource).__name__,
                type(self).__name__,
            )
            resource.close()

    @t.overload
    def read_file(self, file_uri: str, *, as_text: t.Literal[True]) -> str: ...
    @t.overload
    def read_file(self, file_uri: str, *, as_text: t.Literal[False]) -> bytes: ...
    @t.overload
    def read_file(self, file_uri: str) -> str: ...
    @t.overload
    def read_file(self, file_uri: str, *, as_text: bool) -> str | bytes: ...

    def read_file(self, file_uri: str, *, as_text: bool = True) -> str | bytes:
        """
        Given a file URI on a GCS Collection, read the data.

        :param file_uri: The full URI of the file on the collection which is being
            downloaded.
        :param as_text: When ``True``, the file contents are decoded into a string. Set
            to ``False`` to retrieve data as bytes.

        :raises RuntimeError: If no client exists yet and the collection ID cannot
            be detected from an unauthenticated request to ``file_uri``.
        :raises ValueError: If a client already exists and ``file_uri`` names a
            different host than that client's ``base_url``.

        .. caution::

            The file read is done naively as a GET request. This may be unsuitable for
            very large files.
        """
        if self.https_client is None:
            # dynamically build a client; it is owned (and later closed) by the
            # downloader
            self.https_client = self._get_client_from_uri(file_uri)
            self._resources_to_close.append(self.https_client)
        else:
            # an existing client is bound to one collection; refuse other hosts
            self._require_matching_host(self.https_client, file_uri)

        response = self.https_client.get(file_uri)
        if as_text:
            return response.text
        return response.binary_content

    @staticmethod
    def _require_matching_host(
        client: GCSCollectionHTTPSClient, file_uri: str
    ) -> None:
        """
        Raise ``ValueError`` if ``file_uri`` names a different host than ``client``.

        NOTE(review): the full URI is passed to the client as the request path.
        The SDK's ``BaseClient`` is understood to send absolute URLs unchanged
        while attaching the client's authorization, which would deliver one
        collection's credentials to another host -- confirm against the SDK.
        """
        base_url = getattr(client, "base_url", None)
        if not isinstance(base_url, str):
            return

        client_host = urllib.parse.urlparse(base_url).hostname
        file_host = urllib.parse.urlparse(file_uri).hostname
        # only compare when both sides actually name a host
        if client_host and file_host and client_host != file_host:
            msg = (
                f"Cannot read '{file_uri}' with this downloader. Its host "
                f"'{file_host}' does not match the collection host "
                f"'{client_host}'. Use a separate GCSDownloader per collection."
            )
            raise ValueError(msg)

    def _get_client_from_uri(self, file_uri: str) -> GCSCollectionHTTPSClient:
        """Detect the collection for ``file_uri`` and build a client for it."""
        collection_id = self._sniff_collection_id(file_uri)
        scopes = self._detect_scopes(collection_id)
        base_url = _get_base_url(file_uri)
        return self._https_client_constructor(
            collection_client_id=collection_id,
            default_scope_requirements=scopes,
            base_url=base_url,
        )

    def _default_https_client_constructor(
        self,
        *,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope],
        base_url: str,
    ) -> GCSCollectionHTTPSClient:
        """Build a client which shares this downloader's app and transport."""
        return GCSCollectionHTTPSClient(
            collection_client_id,
            default_scope_requirements,
            app=self.app,
            base_url=base_url,
            transport=self.transport,
        )

    def _detect_scopes(self, collection_id: str) -> list[globus_sdk.Scope]:
        """
        Determine the scopes needed for HTTPS access to the given collection.

        A transfer client is created on first use if none was provided; one created
        here is owned by the downloader and closed with it.
        """
        if self.transfer_client is None:
            self.transfer_client = globus_sdk.TransferClient(
                app=self.app, transport=self.transport
            )
            self._resources_to_close.append(self.transfer_client)
        scopes = globus_sdk.scopes.GCSCollectionScopes(collection_id)
        if _uses_data_access(self.transfer_client, collection_id):
            return [scopes.https, scopes.data_access]
        return [scopes.https]

    def _sniff_collection_id(self, file_uri: str) -> str:
        """
        Detect the collection ID by inspecting the redirect for ``file_uri``.

        An unauthenticated GET is sent without following redirects, and the
        ``client_id`` query parameter of the ``Location`` header is returned.

        :raises RuntimeError: If there is no ``Location`` header, or it does not
            carry exactly one ``client_id`` parameter.
        """
        failure_prefix = (
            f"Attempting to detect the collection ID for the file at '{file_uri}' "
            "failed. "
        )

        response = self.transport.request(
            "GET",
            file_uri,
            caller_info=globus_sdk.transport.RequestCallerInfo(
                retry_config=self._retry_config
            ),
            allow_redirects=False,
        )
        if "Location" not in response.headers:
            msg = failure_prefix + (
                "Did not receive a redirect with Location header on "
                "unauthenticated call."
            )
            raise RuntimeError(msg)

        location_header = response.headers["Location"]
        parsed_location = urllib.parse.urlparse(location_header)
        parsed_location_qs = urllib.parse.parse_qs(parsed_location.query)

        if "client_id" not in parsed_location_qs:
            msg = failure_prefix + "Location header did not encode a 'client_id'."
            raise RuntimeError(msg)

        client_ids = parsed_location_qs["client_id"]
        if len(client_ids) != 1:
            msg = failure_prefix + "Multiple 'client_id' params were present."
            raise RuntimeError(msg)

        return client_ids[0]
240+
241+
242+
class GCSCollectionHTTPSClient(globus_sdk.BaseClient):
    """
    A client bound to a single HTTPS-capable Collection, used for file downloads.

    Prefer letting :class:`GCSDownloader` construct and initialize these clients
    rather than instantiating this class directly.

    .. sdk-sphinx-copy-params:: BaseClient

        :param collection_client_id: The ID of the collection.
        :param default_scope_requirements: The scopes needed for HTTPS access to the
            collection. This should contain the ``https`` scope for the collection
            and the ``data_access`` scope if applicable.
    """

    def __init__(
        self,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope] = (),
        *,
        environment: str | None = None,
        base_url: str | None = None,
        app: globus_sdk.GlobusApp | None = None,
        app_scopes: list[globus_sdk.scopes.Scope] | None = None,
        authorizer: GlobusAuthorizer | None = None,
        app_name: str | None = None,
        transport: globus_sdk.transport.RequestsTransport | None = None,
        retry_config: globus_sdk.transport.RetryConfig | None = None,
    ) -> None:
        # These are stored before delegating to the base initializer because
        # ``resource_server`` and ``default_scope_requirements`` read them.
        # NOTE(review): presumably BaseClient.__init__ consults those properties
        # when an app is attached -- confirm before reordering.
        self._default_scope_requirements = [*default_scope_requirements]
        self.collection_client_id = collection_client_id
        super().__init__(
            environment=environment,
            base_url=base_url,
            app=app,
            app_scopes=app_scopes,
            authorizer=authorizer,
            app_name=app_name,
            transport=transport,
            retry_config=retry_config,
        )

    @classproperty
    def resource_server(  # pylint: disable=missing-param-doc
        self_or_cls: globus_sdk.BaseClient | type[globus_sdk.BaseClient],
    ) -> str | None:
        """
        The resource server for a GCS collection is the ID of the collection.

        When accessed on the class rather than on an instance this is ``None``,
        because the collection ID is only stored on an instantiated
        ``GCSCollectionHTTPSClient``.
        """
        if isinstance(self_or_cls, GCSCollectionHTTPSClient):
            return self_or_cls.collection_client_id
        return None

    @property
    def default_scope_requirements(self) -> list[globus_sdk.Scope]:
        """The scopes this client was constructed with."""
        return self._default_scope_requirements
302+
303+
304+
def _get_base_url(file_uri: str) -> str:
    """Return the ``scheme://netloc`` portion of ``file_uri``."""
    # the parse result is a named tuple whose first two fields are scheme and netloc
    scheme, netloc, *_ = urllib.parse.urlparse(file_uri)
    return "://".join((scheme, netloc))
307+
308+
309+
def _uses_data_access(
    transfer_client: globus_sdk.TransferClient, collection_id: str
) -> bool:
    """
    Report whether the collection's endpoint document indicates ``data_access``.

    This is true only for documents whose ``entity_type`` is
    ``GCSv5_mapped_collection`` and whose ``high_assurance`` field is falsy.
    """
    endpoint_doc = transfer_client.get_endpoint(collection_id)
    # ``high_assurance`` is only consulted for mapped collections
    is_mapped = endpoint_doc["entity_type"] == "GCSv5_mapped_collection"
    return is_mapped and not endpoint_doc["high_assurance"]

0 commit comments

Comments
 (0)