Skip to content

Commit bd6e691

Browse files
committed
Add an experimental GCSDownloader component
This is an implementation of HTTPS download from GCS collections via Location header sniffing, which is the only mechanism we have available on all GCS versions for translating a URL to a client ID.
1 parent f9cd908 commit bd6e691

File tree

5 files changed

+428
-0
lines changed

5 files changed

+428
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Added
2+
-----
3+
4+
- Added a new ``globus_sdk.experimental.gcs_downloader`` module which provides
5+
experimental tooling for downloading files from HTTPS GCS Collections. The
6+
module provides helper utilities and a central ``GCSDownloader`` class for
7+
detecting requirements and downloading files. (:pr:`NUMBER`)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
.. _gcs_downloader:
2+
3+
.. currentmodule:: globus_sdk.experimental.gcs_downloader
4+
5+
GCS Downloader
6+
==============
7+
8+
A :class:`GCSDownloader` is an object which handles connections to an
9+
HTTPS-enabled collection and single file downloads over HTTPS.
10+
11+
It primarily features two APIs:
12+
13+
1. Initialization and use as a context manager
14+
2. :meth:`GCSDownloader.read_file` to get a single file by URL
15+
16+
.. autoclass:: GCSDownloader
17+
:members:
18+
:member-order: bysource
19+
20+
.. autoclass:: HTTPSClientConstructor
21+
:members:
22+
:member-order: bysource
23+
24+
.. autoclass:: GCSCollectionHTTPSClient
25+
:members:
26+
:member-order: bysource
27+
28+
Example Usage
29+
-------------
30+
31+
.. code-block:: python
32+
33+
import argparse

import globus_sdk
from globus_sdk.experimental.gcs_downloader import GCSDownloader
34+
35+
parser = argparse.ArgumentParser()
36+
parser.add_argument(
37+
"--url",
38+
# example value is used as a default
39+
default=(
40+
"https://m-d3a2c3.collection1.tutorials.globus.org"
41+
"/home/share/godata/file2.txt"
42+
),
43+
)
44+
args = parser.parse_args()
45+
46+
# SDK Tutorial Client ID - <replace this with your own client>
47+
client_id = "61338d24-54d5-408f-a10d-66c06b59f6d2"
48+
with globus_sdk.UserApp("gcs-downloader-demo", client_id=client_id) as app:
49+
with GCSDownloader(app) as downloader:
50+
print(downloader.read_file(args.url))

docs/experimental/index.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ Globus SDK Experimental Components
1010

1111
**Use at your own risk.**
1212

13+
.. toctree::
14+
:caption: Experimental Constructs
15+
:maxdepth: 1
16+
17+
gcs_downloader
1318

1419
Experimental Construct Lifecycle
1520
--------------------------------
Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
"""
2+
The GCSDownloader provides HTTPS file download capabilities for Globus Connect Server.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
import logging
8+
import sys
9+
import types
10+
import typing as t
11+
import urllib.parse
12+
13+
import globus_sdk
14+
import globus_sdk.scopes
15+
import globus_sdk.transport
16+
from globus_sdk._internal.classprop import classproperty
17+
from globus_sdk._internal.type_definitions import Closable
18+
from globus_sdk.authorizers import GlobusAuthorizer
19+
from globus_sdk.transport.default_retry_checks import DEFAULT_RETRY_CHECKS
20+
21+
if sys.version_info >= (3, 11):
22+
from typing import Self
23+
else:
24+
from typing_extensions import Self
25+
26+
log = logging.getLogger(__name__)
27+
28+
29+
class HTTPSClientConstructor(t.Protocol):
    """
    A protocol which defines the factory type used to customize a GCSDownloader.

    Implementations are callables which accept the three keyword-only arguments
    below and return the client used to perform file reads.

    :param collection_client_id: The ID of the collection being accessed.
    :param default_scope_requirements: The scopes the new client should require.
    :param base_url: The base URL (scheme and host) of the collection.
    """

    def __call__(
        # `self` is required here: a callback protocol describes the *bound* call
        # signature, and without it every argument would have been keyword-only
        # with nowhere to receive the instance.
        self,
        *,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope],
        base_url: str,
    ) -> GCSCollectionHTTPSClient: ...
38+
39+
40+
class GCSDownloader:
    """
    An object which manages connection and authentication state to enable HTTPS
    downloads from a specific Globus Connect Server.

    The initial request to read a file features support for determining authentication
    requirements dynamically, and subsequent requests will reuse that authentication
    data.

    Using a single :class:`GCSDownloader` to access distinct collections is not
    supported. A separate downloader should be used for each collection.

    Downloaders may be used as context managers, in which case they automatically call
    their ``close()`` method on exit:

    >>> with GCSDownloader(app) as downloader:
    ...     print(downloader.read_file(url))

    :param app: The :class:`GlobusApp` used to authenticate calls to this server.
    :param https_client: The underlying client used for the file read request. Typically
        omitted. When not provided, one will be constructed on demand by the downloader.
        As an alternative to providing a client, a callable factory may be passed here,
        which will be given the ``collection_client_id``,
        ``default_scope_requirements``, and ``base_url`` and must return a new client.
    :param transfer_client: A client used when detecting collection information.
        Typically omitted. When not provided, one will be constructed on demand by the
        downloader.
    :param transport: A transport for the downloader, used for authentication
        sniffing operations. When a client is built by the downloader it will
        inherit this transport.
    """

    def __init__(
        self,
        app: globus_sdk.GlobusApp,
        *,
        https_client: GCSCollectionHTTPSClient | HTTPSClientConstructor | None = None,
        transfer_client: globus_sdk.TransferClient | None = None,
        transport: globus_sdk.transport.RequestsTransport | None = None,
    ) -> None:
        self.app = app
        # only objects created by this downloader are tracked here; anything the
        # caller passed in is left for the caller to close
        self._resources_to_close: list[Closable] = []

        if transport is not None:
            self.transport = transport
        else:
            self.transport = globus_sdk.transport.RequestsTransport()
            self._resources_to_close.append(self.transport)

        # the downloader will need a RetryConfig when it uses its own transport
        self._retry_config = globus_sdk.transport.RetryConfig()
        self._retry_config.checks.register_many_checks(DEFAULT_RETRY_CHECKS)

        # declare both attributes exactly once with their defaults (no client yet,
        # default factory), then override according to what was passed
        self.https_client: GCSCollectionHTTPSClient | None = None
        self._https_client_constructor: HTTPSClientConstructor = (
            self._default_https_client_constructor
        )
        if isinstance(https_client, GCSCollectionHTTPSClient):
            # concrete client: store it (the default factory stays set for type
            # safety, but will not be used)
            self.https_client = https_client
        elif https_client is not None:
            # factory callable: store it and build the client lazily
            self._https_client_constructor = https_client

        # set the transfer_client if provided
        self.transfer_client = transfer_client

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """
        Close all resources which are owned by this downloader.

        Resources are closed in the order they were registered. Each resource is
        removed from the tracking list before it is closed, so calling ``close()``
        again (for example, explicitly inside a ``with`` block) does not close
        anything a second time.
        """
        while self._resources_to_close:
            resource = self._resources_to_close.pop(0)
            log.debug(
                "closing resource of type %s for %s",
                type(resource).__name__,
                type(self).__name__,
            )
            resource.close()

    @t.overload
    def read_file(self, file_uri: str, *, as_text: t.Literal[True]) -> str: ...
    @t.overload
    def read_file(self, file_uri: str, *, as_text: t.Literal[False]) -> bytes: ...
    @t.overload
    def read_file(self, file_uri: str) -> str: ...

    def read_file(self, file_uri: str, *, as_text: bool = True) -> str | bytes:
        """
        Given a file URI on a GCS Collection, read the data.

        :param file_uri: The full URI of the file on the collection which is being
            downloaded.
        :param as_text: When ``True``, the file contents are decoded into a string. Set
            to ``False`` to retrieve data as bytes.

        .. caution::

            The file read is done naively as a GET request. This may be unsuitable for
            very large files.
        """
        # dynamically build a client if needed; a client built here is owned by
        # the downloader and therefore registered for closing
        if self.https_client is None:
            self.https_client = self._get_client_from_uri(file_uri)
            self._resources_to_close.append(self.https_client)

        response = self.https_client.get(file_uri)
        if as_text:
            return response.text
        return response.binary_content

    def _get_client_from_uri(self, file_uri: str) -> GCSCollectionHTTPSClient:
        """
        Build an HTTPS client for the collection which serves ``file_uri``.

        The collection ID is sniffed from the URI, the needed scopes are detected
        for that collection, and both are handed to the configured client factory
        together with the URI's base URL.
        """
        collection_id = self._sniff_collection_id(file_uri)
        scopes = self._detect_scopes(collection_id)
        base_url = _get_base_url(file_uri)
        return self._https_client_constructor(
            collection_client_id=collection_id,
            default_scope_requirements=scopes,
            base_url=base_url,
        )

    def _default_https_client_constructor(
        self,
        *,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope],
        base_url: str,
    ) -> GCSCollectionHTTPSClient:
        """
        The factory used when the caller supplied neither a client nor a factory.

        The new client shares this downloader's app and transport.
        """
        return GCSCollectionHTTPSClient(
            collection_client_id,
            default_scope_requirements,
            app=self.app,
            base_url=base_url,
            transport=self.transport,
        )

    def _detect_scopes(self, collection_id: str) -> list[globus_sdk.Scope]:
        """
        Determine the scopes needed to read from the given collection.

        The ``https`` scope is always included; ``data_access`` is added when
        ``_uses_data_access`` reports that the collection needs it. A transfer
        client is created (and registered for closing) if none was provided.
        """
        if self.transfer_client is None:
            self.transfer_client = globus_sdk.TransferClient(
                app=self.app, transport=self.transport
            )
            self._resources_to_close.append(self.transfer_client)
        scopes = globus_sdk.scopes.GCSCollectionScopes(collection_id)
        if _uses_data_access(self.transfer_client, collection_id):
            return [scopes.https, scopes.data_access]
        return [scopes.https]

    def _sniff_collection_id(self, file_uri: str) -> str:
        """
        Detect the collection's client ID from the redirect sent for ``file_uri``.

        A GET is sent with redirects disabled, and the ``client_id`` query
        parameter is read out of the response's ``Location`` header.

        :raises RuntimeError: if there is no ``Location`` header, if it carries no
            ``client_id`` parameter, or if it carries more than one.
        """
        response = self.transport.request(
            "GET",
            file_uri,
            caller_info=globus_sdk.transport.RequestCallerInfo(
                retry_config=self._retry_config
            ),
            # the redirect itself is the data of interest, so it must not be followed
            allow_redirects=False,
        )
        if "Location" not in response.headers:
            msg = (
                f"Attempting to detect the collection ID for the file at '{file_uri}' "
                "failed. Did not receive a redirect with Location header on "
                "unauthenticated call."
            )
            raise RuntimeError(msg)

        location_header = response.headers["Location"]
        parsed_location = urllib.parse.urlparse(location_header)
        parsed_location_qs = urllib.parse.parse_qs(parsed_location.query)

        if "client_id" not in parsed_location_qs:
            msg = (
                f"Attempting to detect the collection ID for the file at '{file_uri}' "
                "failed. Location header did not encode a 'client_id'."
            )
            raise RuntimeError(msg)

        client_ids = parsed_location_qs["client_id"]
        if len(client_ids) != 1:
            msg = (
                f"Attempting to detect the collection ID for the file at '{file_uri}' "
                "failed. Multiple 'client_id' params were present."
            )
            raise RuntimeError(msg)

        return client_ids[0]
241+
242+
243+
class GCSCollectionHTTPSClient(globus_sdk.BaseClient):
    """
    An HTTPS client bound to a single HTTPS-capable Collection, used to download
    files from it.

    These clients are normally created for you by :class:`GCSDownloader`; direct
    instantiation is rarely needed.

    .. sdk-sphinx-copy-params:: BaseClient

        :param collection_client_id: The ID of the collection.
        :param default_scope_requirements: The scopes needed for HTTPS access to the
            collection. This should contain the `https` scope for the collection and
            the `data_access` scope if applicable.
    """

    def __init__(
        self,
        collection_client_id: str,
        default_scope_requirements: t.Iterable[globus_sdk.Scope] = (),
        *,
        environment: str | None = None,
        base_url: str | None = None,
        app: globus_sdk.GlobusApp | None = None,
        app_scopes: list[globus_sdk.scopes.Scope] | None = None,
        authorizer: GlobusAuthorizer | None = None,
        app_name: str | None = None,
        transport: globus_sdk.transport.RequestsTransport | None = None,
        retry_config: globus_sdk.transport.RetryConfig | None = None,
    ) -> None:
        # these two attributes back `resource_server` and
        # `default_scope_requirements`, and are assigned before the base class
        # initializer runs
        self._default_scope_requirements = [*default_scope_requirements]
        self.collection_client_id = collection_client_id
        super().__init__(
            app=app,
            app_name=app_name,
            app_scopes=app_scopes,
            authorizer=authorizer,
            base_url=base_url,
            environment=environment,
            retry_config=retry_config,
            transport=transport,
        )

    @classproperty
    def resource_server(  # pylint: disable=missing-param-doc
        self_or_cls: globus_sdk.BaseClient | type[globus_sdk.BaseClient],
    ) -> str | None:
        """
        The resource server for a GCS collection is the ID of the collection.

        When accessed on the class rather than on an instance this is ``None``,
        because the collection ID is only known to an instantiated
        ``GCSCollectionHTTPSClient``.
        """
        if isinstance(self_or_cls, GCSCollectionHTTPSClient):
            return self_or_cls.collection_client_id

        return None

    @property
    def default_scope_requirements(self) -> list[globus_sdk.Scope]:
        """The scopes which were supplied when this client was constructed."""
        return self._default_scope_requirements
303+
304+
305+
def _get_base_url(file_uri: str) -> str:
306+
parsed = urllib.parse.urlparse(file_uri)
307+
return f"{parsed.scheme}://{parsed.netloc}"
308+
309+
310+
def _uses_data_access(
311+
transfer_client: globus_sdk.TransferClient, collection_id: str
312+
) -> bool:
313+
doc = transfer_client.get_endpoint(collection_id)
314+
if doc["entity_type"] != "GCSv5_mapped_collection":
315+
return False
316+
if doc["high_assurance"]:
317+
return False
318+
return True

0 commit comments

Comments
 (0)