+import logging
+from enum import Enum
+
+from fsspec import asyn
+from google.api_core import exceptions as api_exceptions
+from google.api_core import gapic_v1
+from google.api_core.client_info import ClientInfo
+from google.cloud import storage_control_v2
+from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient
+from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import (
+    AsyncMultiRangeDownloader,
+)
+
+from gcsfs import __version__ as version
+from gcsfs import zb_hns_utils
+from gcsfs.core import GCSFile, GCSFileSystem
+from gcsfs.zonal_file import ZonalFile
+
+logger = logging.getLogger("gcsfs")
+
+USER_AGENT = "python-gcsfs"
+
+
+class BucketType(Enum):
+    ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL"
+    HIERARCHICAL = "HIERARCHICAL"
+    NON_HIERARCHICAL = "NON_HIERARCHICAL"
+    UNKNOWN = "UNKNOWN"
+
+
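+# Dispatch table used by _open(): zonal buckets are served by the MRD-backed
+# ZonalFile, while hierarchical, flat, and unknown buckets keep the default
+# GCSFile behavior.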
+gcs_file_types = {
+    BucketType.ZONAL_HIERARCHICAL: ZonalFile,
+    BucketType.NON_HIERARCHICAL: GCSFile,
+    BucketType.HIERARCHICAL: GCSFile,
+    BucketType.UNKNOWN: GCSFile,
+}
+
+
+class ExtendedGcsFileSystem(GCSFileSystem):
+    """
+    A GCSFileSystem subclass used when the GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT
+    environment variable is set to true. It adds handling for special bucket
+    types, including zonal and hierarchical; buckets without special
+    properties are forwarded to GCSFileSystem for default processing.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.grpc_client = None
+        self._storage_control_client = None
+        # Initialize the gRPC and storage-control clients used for
+        # hierarchical and zonal bucket operations.
+        self.grpc_client = asyn.sync(self.loop, self._create_grpc_client)
+        self._storage_control_client = asyn.sync(
+            self.loop, self._create_control_plane_client
+        )
+        self._storage_layout_cache = {}
+
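+    # Client bootstrap note: asyn.sync runs the async factories below on the
+    # fsspec event loop, so __init__ stays synchronous while the clients are
+    # created on the same loop that later serves the async read path.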
+    async def _create_grpc_client(self):
+        if self.grpc_client is None:
+            return AsyncGrpcClient(
+                client_info=ClientInfo(user_agent=f"{USER_AGENT}/{version}"),
+            ).grpc_client
+        else:
+            return self.grpc_client
+
+    async def _create_control_plane_client(self):
+        # Initialize the storage control plane client for bucket
+        # metadata operations
+        client_info = gapic_v1.client_info.ClientInfo(
+            user_agent=f"{USER_AGENT}/{version}"
+        )
+        return storage_control_v2.StorageControlAsyncClient(
+            credentials=self.credentials.credentials, client_info=client_info
+        )
+
+    async def _lookup_bucket_type(self, bucket):
+        if bucket in self._storage_layout_cache:
+            return self._storage_layout_cache[bucket]
+        bucket_type = await self._get_bucket_type(bucket)
+        # Don't cache UNKNOWN, so transient failures are retried on the
+        # next lookup
+        if bucket_type == BucketType.UNKNOWN:
+            return BucketType.UNKNOWN
+        self._storage_layout_cache[bucket] = bucket_type
+        return self._storage_layout_cache[bucket]
+
+    _sync_lookup_bucket_type = asyn.sync_wrapper(_lookup_bucket_type)
+
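+    # Bucket-type detection uses the storage-control getStorageLayout API: a
+    # location_type of "zone" marks a zonal bucket, and all other layouts are
+    # treated as non-hierarchical for now (HNS detection is a follow-up).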
+    async def _get_bucket_type(self, bucket):
+        try:
+            bucket_name_value = f"projects/_/buckets/{bucket}/storageLayout"
+            response = await self._storage_control_client.get_storage_layout(
+                name=bucket_name_value
+            )
+
+            if response.location_type == "zone":
+                return BucketType.ZONAL_HIERARCHICAL
+            else:
+                # This should be updated to include HNS in the future
+                return BucketType.NON_HIERARCHICAL
+        except api_exceptions.NotFound:
+            logger.warning(f"Bucket {bucket} not found or you lack permissions.")
+            return BucketType.UNKNOWN
+        except Exception as e:
+            logger.error(
+                f"Could not determine bucket type for bucket name {bucket}: {e}"
+            )
+            # Default to UNKNOWN when the bucket type cannot be determined
+            return BucketType.UNKNOWN
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        cache_options=None,
+        acl=None,
+        consistency=None,
+        metadata=None,
+        autocommit=True,
+        fixed_key_metadata=None,
+        generation=None,
+        **kwargs,
+    ):
+        """
+        Open a file, dispatching to the file class that matches the bucket
+        type (ZonalFile for zonal buckets, GCSFile otherwise).
+        """
+        bucket, _, _ = self.split_path(path)
+        bucket_type = self._sync_lookup_bucket_type(bucket)
+        return gcs_file_types[bucket_type](
+            self,
+            path,
+            mode,
+            block_size,
+            cache_options=cache_options,
+            consistency=consistency,
+            metadata=metadata,
+            acl=acl,
+            autocommit=autocommit,
+            fixed_key_metadata=fixed_key_metadata,
+            generation=generation,
+            **kwargs,
+        )
+
+    # Replacement for _process_limits: MRD reads take an (offset, length)
+    # pair rather than (start, end) byte positions.
+    async def _process_limits_to_offset_and_length(self, path, start, end):
+        """
+        Calculates the read offset and length from start and end parameters.
+
+        Args:
+            path (str): The path to the file.
+            start (int | None): The starting byte position; negative values
+                are relative to the end of the file.
+            end (int | None): The ending byte position; negative values are
+                relative to the end of the file.
+
+        Returns:
+            tuple: A tuple containing (offset, length).
+
+        Raises:
+            ValueError: If the calculated range is invalid.
+        """
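+        # Worked example: for a 100-byte object, start=-10, end=None yields
+        # offset = 100 - 10 = 90 and length = 100 - 90 = 10; start=0, end=250
+        # is clamped to the object size, yielding offset=0, length=100.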
+        size = None
+
+        if start is None:
+            offset = 0
+        elif start < 0:
+            size = (await self._info(path))["size"] if size is None else size
+            offset = size + start
+        else:
+            offset = start
+
+        if end is None:
+            size = (await self._info(path))["size"] if size is None else size
+            effective_end = size
+        elif end < 0:
+            size = (await self._info(path))["size"] if size is None else size
+            effective_end = size + end
+        else:
+            effective_end = end
+
+        if offset < 0:
+            raise ValueError(f"Calculated start offset ({offset}) cannot be negative.")
+        if effective_end < offset:
+            raise ValueError(
+                f"Calculated end position ({effective_end}) cannot be before start offset ({offset})."
+            )
+        elif effective_end == offset:
+            length = 0  # Handle zero-length slice
+        else:
+            length = effective_end - offset  # Normal case
+            size = (await self._info(path))["size"] if size is None else size
+            if effective_end > size:
+                length = max(0, size - offset)  # Clamp and ensure non-negative
+
+        return offset, length
+
+    sync_process_limits_to_offset_and_length = asyn.sync_wrapper(
+        _process_limits_to_offset_and_length
+    )
+
+    async def _is_zonal_bucket(self, bucket):
+        bucket_type = await self._lookup_bucket_type(bucket)
+        return bucket_type == BucketType.ZONAL_HIERARCHICAL
+
+    async def _cat_file(self, path, start=None, end=None, mrd=None, **kwargs):
+        """Fetch a file's contents as bytes, with an optimized path for Zonal buckets.
+
+        This method overrides the parent `_cat_file` to read objects in Zonal buckets using gRPC.
+
+        Args:
+            path (str): The full GCS path to the file (e.g., "bucket/object").
+            start (int, optional): The starting byte position to read from.
+            end (int, optional): The ending byte position to read to.
+            mrd (AsyncMultiRangeDownloader, optional): An existing multi-range
+                downloader instance. If not provided, a new one will be created for Zonal buckets.
+
+        Returns:
+            bytes: The content of the file or file range.
+        """
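+        # A caller-supplied mrd (e.g. from a ZonalFile that already opened the
+        # object) implies the bucket is known to be zonal, so the layout
+        # lookup below is skipped.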
+        mrd_created = False
+
+        # A new MRD is required when a read is issued directly through the
+        # GCSFileSystem class without creating a GCSFile object first.
+        if mrd is None:
+            bucket, object_name, generation = self.split_path(path)
+            # Fall back to the default implementation if not a zonal bucket
+            if not await self._is_zonal_bucket(bucket):
+                return await super()._cat_file(path, start=start, end=end, **kwargs)
+
+            mrd = await AsyncMultiRangeDownloader.create_mrd(
+                self.grpc_client, bucket, object_name, generation
+            )
+            mrd_created = True
+
+        offset, length = await self._process_limits_to_offset_and_length(
+            path, start, end
+        )
+        try:
+            return await zb_hns_utils.download_range(
+                offset=offset, length=length, mrd=mrd
+            )
+        finally:
+            # Explicitly clean up the MRD if we created it here
+            if mrd_created:
+                await mrd.close()
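+
+# Usage sketch (illustrative only, not part of the module). Assuming the
+# experimental flag is honored and "my-zonal-bucket" (a hypothetical name)
+# is a zonal bucket, reads route through the MRD path; other buckets keep
+# the default GCSFileSystem behavior:
+#
+#     fs = ExtendedGcsFileSystem()
+#     with fs.open("my-zonal-bucket/data.bin", "rb") as f:
+#         header = f.read(4096)
+#     payload = fs.cat_file("my-zonal-bucket/data.bin", start=0, end=1024)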