|
30 | 30 | from .credentials import GoogleCredentials |
31 | 31 | from .inventory_report import InventoryReport |
32 | 32 | from .retry import errs, retry_request, validate_response |
| 33 | +from .zb_hns_utils import DEFAULT_CONCURRENCY |
33 | 34 |
|
34 | 35 | logger = logging.getLogger("gcsfs") |
35 | 36 |
|
@@ -299,6 +300,7 @@ class GCSFileSystem(asyn.AsyncFileSystem): |
299 | 300 | default_block_size = DEFAULT_BLOCK_SIZE |
300 | 301 | protocol = "gs", "gcs" |
301 | 302 | async_impl = True |
| 303 | + MIN_CHUNK_SIZE_FOR_CONCURRENCY = 5 * 1024 * 1024 |
302 | 304 |
|
303 | 305 | def __init__( |
304 | 306 | self, |
@@ -1166,22 +1168,75 @@ def url(self, path): |
1166 | 1168 | f"&generation={generation}" if generation else "", |
1167 | 1169 | ) |
1168 | 1170 |
|
1169 | | - async def _cat_file(self, path, start=None, end=None, **kwargs): |
| 1171 | + async def _cat_file_sequential(self, path, start=None, end=None, **kwargs): |
1170 | 1172 | """Simple one-shot get of file data""" |
1171 | 1173 | # if start and end are both provided and valid, but start >= end, return empty bytes |
1172 | 1174 | # Otherwise, _process_limits would generate an invalid HTTP range (e.g. "bytes=5-4" |
1173 | 1175 | # for start=5, end=5), causing the server to return the whole file instead of nothing. |
1174 | 1176 | if start is not None and end is not None and start >= end >= 0: |
1175 | 1177 | return b"" |
| 1178 | + |
1176 | 1179 | u2 = self.url(path) |
1177 | | - # 'if start or end' fails when start=0 or end=0 because 0 is Falsey. |
1178 | 1180 | if start is not None or end is not None: |
1179 | 1181 | head = {"Range": await self._process_limits(path, start, end)} |
1180 | 1182 | else: |
1181 | 1183 | head = {} |
| 1184 | + |
1182 | 1185 | headers, out = await self._call("GET", u2, headers=head) |
1183 | 1186 | return out |
1184 | 1187 |
|
| 1188 | + async def _cat_file_concurrent( |
| 1189 | + self, path, start=None, end=None, concurrency=DEFAULT_CONCURRENCY, **kwargs |
| 1190 | + ): |
| 1191 | + """Concurrent fetch of file data""" |
| 1192 | + if start is None: |
| 1193 | + start = 0 |
| 1194 | + if end is None: |
| 1195 | + end = (await self._info(path))["size"] |
| 1196 | + if start >= end: |
| 1197 | + return b"" |
| 1198 | + |
| 1199 | + if concurrency <= 1 or end - start < self.MIN_CHUNK_SIZE_FOR_CONCURRENCY: |
| 1200 | + return await self._cat_file_sequential(path, start=start, end=end, **kwargs) |
| 1201 | + |
| 1202 | + total_size = end - start |
| 1203 | + part_size = total_size // concurrency |
| 1204 | + tasks = [] |
| 1205 | + |
| 1206 | + for i in range(concurrency): |
| 1207 | + offset = start + (i * part_size) |
| 1208 | + actual_size = ( |
| 1209 | + part_size if i < concurrency - 1 else total_size - (i * part_size) |
| 1210 | + ) |
| 1211 | + tasks.append( |
| 1212 | + asyncio.create_task( |
| 1213 | + self._cat_file_sequential( |
| 1214 | + path, start=offset, end=offset + actual_size, **kwargs |
| 1215 | + ) |
| 1216 | + ) |
| 1217 | + ) |
| 1218 | + |
| 1219 | + try: |
| 1220 | + results = await asyncio.gather(*tasks) |
| 1221 | + return b"".join(results) |
| 1222 | + except BaseException as e: |
| 1223 | + for t in tasks: |
| 1224 | + if not t.done(): |
| 1225 | + t.cancel() |
| 1226 | + await asyncio.gather(*tasks, return_exceptions=True) |
| 1227 | + raise e |
| 1228 | + |
| 1229 | + async def _cat_file( |
| 1230 | + self, path, start=None, end=None, concurrency=DEFAULT_CONCURRENCY, **kwargs |
| 1231 | + ): |
| 1232 | + """Simple one-shot, or concurrent get of file data""" |
| 1233 | + if concurrency > 1: |
| 1234 | + return await self._cat_file_concurrent( |
| 1235 | + path, start=start, end=end, concurrency=concurrency, **kwargs |
| 1236 | + ) |
| 1237 | + |
| 1238 | + return await self._cat_file_sequential(path, start=start, end=end, **kwargs) |
| 1239 | + |
1185 | 1240 | async def _getxattr(self, path, attr): |
1186 | 1241 | """Get user-defined metadata attribute""" |
1187 | 1242 | meta = (await self._info(path)).get("metadata", {}) |
@@ -2020,6 +2075,30 @@ def __init__( |
2020 | 2075 | self.acl = acl |
2021 | 2076 | self.consistency = consistency |
2022 | 2077 | self.checker = get_consistency_checker(consistency) |
| 2078 | + |
| 2079 | + # Ideally, all of these fields should be part of `cache_options`. Because current |
| 2080 | + # `fsspec` caches do not accept arbitrary `*args` and `**kwargs`, passing them |
| 2081 | + # there currently causes instantiation errors. We are holding off on introducing |
| 2082 | + # them as explicit keyword arguments to ensure existing user workloads are not |
| 2083 | + # disrupted. This will be refactored once the upstream `fsspec` changes are merged. |
| 2084 | + use_prefetch_reader = kwargs.get( |
| 2085 | + "use_experimental_adaptive_prefetching", False |
| 2086 | + ) or os.environ.get("use_experimental_adaptive_prefetching", False) |
| 2087 | + self.concurrency = kwargs.get("concurrency", DEFAULT_CONCURRENCY) |
| 2088 | + |
| 2089 | + if use_prefetch_reader: |
| 2090 | + max_prefetch_size = kwargs.get("max_prefetch_size", None) |
| 2091 | + from .prefetcher import BackgroundPrefetcher |
| 2092 | + |
| 2093 | + self._prefetch_engine = BackgroundPrefetcher( |
| 2094 | + self._async_fetch_range, |
| 2095 | + self.size, |
| 2096 | + max_prefetch_size=max_prefetch_size, |
| 2097 | + concurrency=self.concurrency, |
| 2098 | + ) |
| 2099 | + else: |
| 2100 | + self._prefetch_engine = None |
| 2101 | + |
2023 | 2102 | # _supports_append is an internal argument not meant to be used directly. |
2024 | 2103 | # If True, allows opening file in append mode. This is generally not supported |
2025 | 2104 | # by GCS, but may be supported by subclasses (e.g. ZonalFile). This flag should |
@@ -2202,12 +2281,30 @@ def _fetch_range(self, start=None, end=None): |
2202 | 2281 | if not both None, fetch only given range |
2203 | 2282 | """ |
2204 | 2283 | try: |
2205 | | - return self.gcsfs.cat_file(self.path, start=start, end=end) |
| 2284 | + if self._prefetch_engine: |
| 2285 | + return self._prefetch_engine._fetch(start=start, end=end) |
| 2286 | + return self.gcsfs.cat_file( |
| 2287 | + self.path, start=start, end=end, concurrency=self.concurrency |
| 2288 | + ) |
2206 | 2289 | except RuntimeError as e: |
2207 | 2290 | if "not satisfiable" in str(e): |
2208 | 2291 | return b"" |
2209 | 2292 | raise |
2210 | 2293 |
|
| 2294 | + async def _async_fetch_range(self, start_offset, total_size, split_factor=1): |
| 2295 | + """Async fetcher mapped to the Prefetcher engine for regional buckets.""" |
| 2296 | + return await self.gcsfs._cat_file_concurrent( |
| 2297 | + self.path, |
| 2298 | + start=start_offset, |
| 2299 | + end=start_offset + total_size, |
| 2300 | + concurrency=split_factor, |
| 2301 | + ) |
| 2302 | + |
| 2303 | + def close(self): |
| 2304 | + if hasattr(self, "_prefetch_engine") and self._prefetch_engine: |
| 2305 | + self._prefetch_engine.close() |
| 2306 | + super().close() |
| 2307 | + |
2211 | 2308 |
|
2212 | 2309 | def _convert_fixed_key_metadata(metadata, *, from_google=False): |
2213 | 2310 | """ |
|
0 commit comments