Add prefetcher reader for standard buckets. #795
base: main
@@ -30,6 +30,7 @@
from .credentials import GoogleCredentials
from .inventory_report import InventoryReport
from .retry import errs, retry_request, validate_response
from .zb_hns_utils import DEFAULT_CONCURRENCY

logger = logging.getLogger("gcsfs")

@@ -299,6 +300,7 @@ class GCSFileSystem(asyn.AsyncFileSystem):
    default_block_size = DEFAULT_BLOCK_SIZE
    protocol = "gs", "gcs"
    async_impl = True
    MIN_CHUNK_SIZE_FOR_CONCURRENCY = 5 * 1024 * 1024

    def __init__(
        self,
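The new class constant gates the concurrent path: ranges under 5 MiB are not worth splitting. A quick standalone check of the guard used by _cat_file_concurrent (introduced in the next hunk); the values are illustrative:

MIN_CHUNK_SIZE_FOR_CONCURRENCY = 5 * 1024 * 1024  # 5 MiB, as added above

# Mirrors the guard in _cat_file_concurrent: small ranges stay sequential
# no matter how high the requested concurrency is.
start, end, concurrency = 0, 4 * 1024 * 1024, 8
sequential = concurrency <= 1 or end - start < MIN_CHUNK_SIZE_FOR_CONCURRENCY
print(sequential)  # True: a 4 MiB read takes the one-shot GET path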
@@ -1166,22 +1168,75 @@ def url(self, path):
            f"&generation={generation}" if generation else "",
        )

-    async def _cat_file(self, path, start=None, end=None, **kwargs):
+    async def _cat_file_sequential(self, path, start=None, end=None, **kwargs):
        """Simple one-shot get of file data"""
        # if start and end are both provided and valid, but start >= end, return empty bytes.
        # Otherwise, _process_limits would generate an invalid HTTP range (e.g. "bytes=5-4"
        # for start=5, end=5), causing the server to return the whole file instead of nothing.
        if start is not None and end is not None and start >= end >= 0:
            return b""

        u2 = self.url(path)
        # 'if start or end' fails when start=0 or end=0 because 0 is Falsey.
        if start is not None or end is not None:
            head = {"Range": await self._process_limits(path, start, end)}
        else:
            head = {}

        headers, out = await self._call("GET", u2, headers=head)
        return out

    async def _cat_file_concurrent(
        self, path, start=None, end=None, concurrency=DEFAULT_CONCURRENCY, **kwargs
    ):
        """Concurrent fetch of file data"""
        if start is None:
            start = 0
        if end is None:
            end = (await self._info(path))["size"]
        if start >= end:
            return b""

        if concurrency <= 1 or end - start < self.MIN_CHUNK_SIZE_FOR_CONCURRENCY:
            return await self._cat_file_sequential(path, start=start, end=end, **kwargs)

        total_size = end - start
        part_size = total_size // concurrency
        tasks = []

        for i in range(concurrency):
            offset = start + (i * part_size)
            actual_size = (
                part_size if i < concurrency - 1 else total_size - (i * part_size)
            )
            tasks.append(
                asyncio.create_task(
                    self._cat_file_sequential(
                        path, start=offset, end=offset + actual_size, **kwargs
                    )
                )
            )

        try:
            results = await asyncio.gather(*tasks)
            return b"".join(results)
        except BaseException as e:
            for t in tasks:
                if not t.done():
                    t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise e

    async def _cat_file(
        self, path, start=None, end=None, concurrency=DEFAULT_CONCURRENCY, **kwargs
    ):
        """Simple one-shot, or concurrent get of file data"""
        if concurrency > 1:
            return await self._cat_file_concurrent(
                path, start=start, end=end, concurrency=concurrency, **kwargs
            )

        return await self._cat_file_sequential(path, start=start, end=end, **kwargs)

    async def _getxattr(self, path, attr):
        """Get user-defined metadata attribute"""
        meta = (await self._info(path)).get("metadata", {})

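For illustration, the range-splitting arithmetic from _cat_file_concurrent as a standalone sketch; split_range is a hypothetical helper, not part of the PR. Integer division makes the first concurrency - 1 parts equal, and the last part absorbs the remainder:

def split_range(start, end, concurrency):
    """Return (offset, size) pairs covering [start, end) across `concurrency` parts."""
    total_size = end - start
    part_size = total_size // concurrency
    parts = []
    for i in range(concurrency):
        offset = start + i * part_size
        # The final part picks up the remainder of the integer division.
        size = part_size if i < concurrency - 1 else total_size - i * part_size
        parts.append((offset, size))
    return parts

# A 10 MiB + 3 byte range split four ways: the last part is 2.5 MiB + 3 bytes.
print(split_range(0, 10 * 1024 * 1024 + 3, 4))
# [(0, 2621440), (2621440, 2621440), (5242880, 2621440), (7864320, 2621443)]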
@@ -2020,6 +2075,38 @@ def __init__(
        self.acl = acl
        self.consistency = consistency
        self.checker = get_consistency_checker(consistency)

        # Ideally, all of these fields should be part of `cache_options`. Because current
        # `fsspec` caches do not accept arbitrary `*args` and `**kwargs`, passing them
        # there currently causes instantiation errors. We are holding off on introducing
        # them as explicit keyword arguments to ensure existing user workloads are not
        # disrupted. This will be refactored once the upstream `fsspec` changes are merged.
        use_prefetch_reader = kwargs.get(

Contributor: let's only do env variable for flag and not kwargs

            "use_experimental_adaptive_prefetching", False
        ) or os.environ.get(
            "use_experimental_adaptive_prefetching", "false"

Contributor: let's capitalize this, to be consistent with other env variable naming convention

Member: Can someone please explain why we cannot use normal cache_type= and cache_options= alone rather than having to invent a set of new environment variables (not to mention the extra kwargs)?

        ).lower() in (
            "true",

Contributor: "true", "1" should be fine here

            "1",
            "t",
            "y",
            "yes",
        )
        self.concurrency = kwargs.get("concurrency", DEFAULT_CONCURRENCY)

        if use_prefetch_reader:
            max_prefetch_size = kwargs.get("max_prefetch_size", None)
            from .prefetcher import BackgroundPrefetcher

            self._prefetch_engine = BackgroundPrefetcher(
                self._async_fetch_range,
                self.size,
                max_prefetch_size=max_prefetch_size,
                concurrency=self.concurrency,
            )
        else:
            self._prefetch_engine = None

        # _supports_append is an internal argument not meant to be used directly.
        # If True, allows opening file in append mode. This is generally not supported
        # by GCS, but may be supported by subclasses (e.g. ZonalFile). This flag should

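Assuming the flag spelling from this diff survives review (the comments above ask for the env variable to be capitalized and for the kwargs path to be dropped), opting in would look roughly like this; the bucket path is hypothetical:

import os
import gcsfs

# Flag name as it appears in this diff; subject to the review feedback above.
os.environ["use_experimental_adaptive_prefetching"] = "true"

fs = gcsfs.GCSFileSystem()
# GCSFile.__init__ then builds a BackgroundPrefetcher, and ranged reads via
# _fetch_range go through the prefetch engine instead of one-shot GETs.
with fs.open("my-bucket/large-object.bin", "rb") as f:
    header = f.read(1024 * 1024)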
@@ -2202,12 +2289,30 @@ def _fetch_range(self, start=None, end=None):
        if not both None, fetch only given range
        """
        try:
-            return self.gcsfs.cat_file(self.path, start=start, end=end)
+            if self._prefetch_engine:
+                return self._prefetch_engine._fetch(start=start, end=end)
+            return self.gcsfs.cat_file(
+                self.path, start=start, end=end, concurrency=self.concurrency
+            )
        except RuntimeError as e:
            if "not satisfiable" in str(e):
                return b""
            raise

    async def _async_fetch_range(self, start_offset, total_size, split_factor=1):
        """Async fetcher mapped to the Prefetcher engine for regional buckets."""
        return await self.gcsfs._cat_file_concurrent(
            self.path,
            start=start_offset,
            end=start_offset + total_size,
            concurrency=split_factor,
        )

    def close(self):
        if hasattr(self, "_prefetch_engine") and self._prefetch_engine:
            self._prefetch_engine.close()
        super().close()


def _convert_fixed_key_metadata(metadata, *, from_google=False):
    """
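Even without the prefetch engine, the new concurrency plumbing reaches ranged reads through cat_file. A sketch under that assumption, with a hypothetical bucket and illustrative values:

import gcsfs

fs = gcsfs.GCSFileSystem()

# For a 64 MiB range with concurrency=8, _cat_file dispatches to
# _cat_file_concurrent, which issues eight ~8 MiB ranged GETs in parallel
# and joins the results; ranges under 5 MiB fall back to the sequential path.
data = fs.cat_file(
    "my-bucket/large-object.bin", start=0, end=64 * 1024 * 1024, concurrency=8
)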