From ecbec0e62ebd39fd3435ebd8e5d608e822878afb Mon Sep 17 00:00:00 2001
From: Ankita Luthra
Date: Sat, 27 Sep 2025 05:08:47 +0000
Subject: [PATCH 01/59] adds logic for handling requests from new adapter if
 bucket type is special type

---
 gcsfs/core.py        | 17 +++++++++++++++++
 gcsfs/gcs_adapter.py | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 gcsfs/gcs_adapter.py

diff --git a/gcsfs/core.py b/gcsfs/core.py
index 91dd3d9f..ec25c48f 100644
--- a/gcsfs/core.py
+++ b/gcsfs/core.py
@@ -29,6 +29,7 @@
 from .credentials import GoogleCredentials
 from .inventory_report import InventoryReport
 from .retry import errs, retry_request, validate_response
+from .gcs_adapter import GCSAdapter
 
 logger = logging.getLogger("gcsfs")
 
@@ -281,6 +282,14 @@ class GCSFileSystem(asyn.AsyncFileSystem):
     default_block_size = DEFAULT_BLOCK_SIZE
     protocol = "gs", "gcs"
     async_impl = True
+    bucket_type = None
+    _adapter = None
+
+    @classmethod
+    def _get_adapter(cls, **kwargs):
+        if cls._adapter is None:
+            cls._adapter = GCSAdapter(**kwargs)
+        return cls._adapter
 
     def __init__(
         self,
@@ -301,6 +310,7 @@ def __init__(
         endpoint_url=None,
         default_location=None,
         version_aware=False,
+        bucket_type=None,
         **kwargs,
     ):
         if cache_timeout is not None:
@@ -327,6 +337,7 @@ def __init__(
         self.session_kwargs = session_kwargs or {}
         self.default_location = default_location
         self.version_aware = version_aware
+        self.bucket_type = bucket_type
 
         if check_connection:
             warnings.warn(
@@ -456,6 +467,12 @@ def _format_path(self, path, args):
     async def _request(
         self, method, path, *args, headers=None, json=None, data=None, **kwargs
     ):
+        if self.bucket_type == "Zonal":
+            adapter = self._get_adapter(project=self.project, token=self.credentials.credentials)
+            result = adapter.handle(method, self._format_path(path, args), **kwargs)
+            if result is not None:
+                return result
+
         await self._set_session()
         if hasattr(data, "seek"):
             data.seek(0)
diff --git a/gcsfs/gcs_adapter.py b/gcsfs/gcs_adapter.py
new file mode 100644
index 00000000..f62e865b
--- /dev/null
+++ b/gcsfs/gcs_adapter.py
@@ -0,0 +1,41 @@
+from google.cloud import storage
+import re
+
+class GCSAdapter:
+    def __init__(self, project=None, token=None):
+        self.client = storage.Client(project=project, credentials=token)
+
+    def _parse_path(self, path):
+        match = re.match(r"https://storage.googleapis.com/storage/v1/b/([^/]+)/o/([^?]+)", path)
+        if match:
+            return match.groups()
+        return None, None
+
+    def read_object(self, path, **kwargs):
+        bucket_name, blob_name = self._parse_path(path)
+        if not bucket_name or not blob_name:
+            raise ValueError(f"Invalid GCS path: {path}")
+
+        bucket = self.client.bucket(bucket_name)
+        blob = bucket.blob(blob_name)
+
+        # Assuming the 'read' operation is for downloading the object's content
+        try:
+            content = blob.download_as_bytes()
+            # The original _request method returns a tuple, so we mimic that structure
+            # (status, headers, info, contents)
+            return 200, {}, {}, content
+        except Exception as e:
+            # Mimic a basic error response
+            return 404, {}, {}, str(e).encode()
+
+    def handle(self, method, path, **kwargs):
+        # Check if the request is a read operation.
+        # A simple check is to see if the method is 'GET' and if it's a media download link.
+        is_read_request = method.upper() == 'GET' and "alt=media" in path
+
+        if is_read_request:
+            return self.read_object(path, **kwargs)
+
+        # If it's not a read request this adapter should handle, do nothing.
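+        # Non-read requests fall through to the regular HTTP request path.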
+        return None
\ No newline at end of file

From d506a60e76fb43be3a7e2f6ef48a5613ae3dff7a Mon Sep 17 00:00:00 2001
From: Ankita Luthra
Date: Wed, 1 Oct 2025 06:20:31 +0000
Subject: [PATCH 02/59] adds logic for handling requests from new adapter if
 bucket type is zonal

---
 gcsfs/core.py            | 39 +++++++++++++++++++++++--
 gcsfs/tests/test_core.py | 61 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/gcsfs/core.py b/gcsfs/core.py
index ec25c48f..0db9e432 100644
--- a/gcsfs/core.py
+++ b/gcsfs/core.py
@@ -13,6 +13,7 @@
 import warnings
 import weakref
 from datetime import datetime, timedelta
+from enum import Enum
 from urllib.parse import parse_qs
 from urllib.parse import quote as quote_urllib
 from urllib.parse import urlsplit
@@ -30,6 +31,7 @@
 from .inventory_report import InventoryReport
 from .retry import errs, retry_request, validate_response
 from .gcs_adapter import GCSAdapter
+from enum import Enum
 
 logger = logging.getLogger("gcsfs")
 
@@ -68,7 +70,6 @@
     "custom_time": "customTime",
 }
 
-
 def quote(s):
     """
     Quote characters to be safe for URL paths.
@@ -282,6 +283,12 @@ class GCSFileSystem(asyn.AsyncFileSystem):
     default_block_size = DEFAULT_BLOCK_SIZE
     protocol = "gs", "gcs"
     async_impl = True
+
+    class BucketType(Enum):
+        ZONAL = "ZONAL"
+        REGIONAL = "REGIONAL"
+        UNKNOWN = "UNKNOWN"
+
     bucket_type = None
     _adapter = None
@@ -337,6 +344,13 @@ def __init__(
         self.session_kwargs = session_kwargs or {}
         self.default_location = default_location
         self.version_aware = version_aware
         self.bucket_type = bucket_type
+        self._storage_layout_cache = {}
+
+        if project and project != DEFAULT_PROJECT:
+            try:
+                self.bucket_type = self._get_storage_layout(project)
+            except Exception as e:
+                logger.warning(f"Failed to get storage layout for bucket {project}: {e}")
 
         if check_connection:
             warnings.warn(
@@ -349,6 +363,27 @@ def __init__(
         self.credentials = GoogleCredentials(
             project, access, token, on_google=self.on_google
         )
 
+    async def _get_storage_layout(self, bucket):
+        if bucket in self._storage_layout_cache:
+            return self._storage_layout_cache[bucket]
+
+        try:
+            response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True)
+            if response.get("locationType") == "zone":
+                bucket_type = self.BucketType.ZONAL
+            else:
+                # This should be updated to include HNS in the future
+                bucket_type = self.BucketType.REGIONAL
+            self._storage_layout_cache[bucket] = bucket_type
+            return bucket_type
+        except Exception as e:
+            logger.error(f"Could not determine storage layout for bucket {bucket}: {e}")
+            # Default to UNKNOWN
+            self._storage_layout_cache[bucket] = self.BucketType.UNKNOWN
+            return self.BucketType.UNKNOWN
+
+    _get_storage_layout = asyn.sync_wrapper(_get_storage_layout)
+
     @property
     def _location(self):
         return self._endpoint or _location()
@@ -467,7 +502,7 @@ def _format_path(self, path, args):
     async def _request(
         self, method, path, *args, headers=None, json=None, data=None, **kwargs
     ):
-        if self.bucket_type == "Zonal":
+        if self.bucket_type == self.BucketType.ZONAL:
             adapter = self._get_adapter(project=self.project, token=self.credentials.credentials)
             result = adapter.handle(method, self._format_path(path, args), **kwargs)
             if result is not None:
                 return result
diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py
index f3cc3870..ef4735ef 100644
--- a/gcsfs/tests/test_core.py
+++ b/gcsfs/tests/test_core.py
@@ -1702,3 +1702,64 @@ def test_near_find(gcs):
 def test_get_error(gcs):
     with pytest.raises(FileNotFoundError):
         gcs.get_file(f"{TEST_BUCKET}/doesnotexist", "other")
+
+
+def test_storage_layout_zonal(gcs_factory):
+    with
mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: + # Mock the storage layout response for a Zonal bucket + mock_get_layout.return_value = GCSFileSystem.BucketType.ZONAL + + gcs = gcs_factory(project=TEST_BUCKET) + + # Verify that _get_storage_layout was called + mock_get_layout.assert_called_with(TEST_BUCKET) + + # Verify that the bucket_type is set to "Zonal" + assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL + + +def test_storage_layout_regional(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: + # Mock the storage layout response for a Regional bucket + mock_get_layout.return_value = GCSFileSystem.BucketType.REGIONAL + + gcs = gcs_factory(project=TEST_BUCKET) + + # Verify that _get_storage_layout was called + mock_get_layout.assert_called_with(TEST_BUCKET) + + # Verify that the bucket_type is set to "Regional/Multi-Regional" + assert gcs.bucket_type == GCSFileSystem.BucketType.REGIONAL + + +def test_adapter_called_for_zonal_bucket(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout, \ + mock.patch("gcsfs.core.GCSAdapter.handle") as mock_handle: + + mock_get_layout.return_value = GCSFileSystem.BucketType.ZONAL + mock_handle.return_value = (200, {}, {}, b"some data") + + gcs = gcs_factory(project=TEST_BUCKET) + assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL + + # This call should be routed to the gcs adapter and this test should be + # updated once read functionality is added in adapter + gcs.cat_file(f"{TEST_BUCKET}/some/file") + + mock_handle.assert_called_once() + + +def test_adapter_not_called_for_regional_bucket(gcs): + with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: + + mock_get_layout.return_value = GCSFileSystem.BucketType.REGIONAL + + # We need to set the bucket_type on the existing `gcs` fixture instance + gcs.bucket_type = GCSFileSystem.BucketType.REGIONAL + + # Use an existing file from the test setup + test_file = f"{TEST_BUCKET}/test/accounts.1.json" + + # This call should NOT be routed to the adapter + data = gcs.cat(test_file) + assert data == files["test/accounts.1.json"] From 9a70933963da0fe78b685d322bae59ce7245c73c Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Fri, 3 Oct 2025 13:27:02 +0000 Subject: [PATCH 03/59] refactor bucket type enum to include value for HNS bucket type | Fixes new tests --- gcsfs/core.py | 22 +++++++-------- gcsfs/tests/test_core.py | 59 ++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 0db9e432..d2b24e45 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -285,8 +285,9 @@ class GCSFileSystem(asyn.AsyncFileSystem): async_impl = True class BucketType(Enum): - ZONAL = "ZONAL" - REGIONAL = "REGIONAL" + ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" + HIERARCHICAL = "HIERARCHICAL" + NON_HIERARCHICAL = "NON_HIERARCHICAL" UNKNOWN = "UNKNOWN" bucket_type = None @@ -347,11 +348,10 @@ def __init__( self.bucket_type = bucket_type self._storage_layout_cache = {} - if project and project != DEFAULT_PROJECT: - try: - self.bucket_type = self._get_storage_layout(project) - except Exception as e: - logger.warning(f"Failed to get storage layout for bucket {project}: {e}") + try: + self.bucket_type = self._sync_get_storage_layout(project) + except Exception as e: + logger.warning(f"Failed to get storage layout for bucket {project}: {e}") if check_connection: warnings.warn( @@ -370,10 +370,10 @@ async 
def _get_storage_layout(self, bucket): try: response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) if response.get("locationType") == "zone": - bucket_type = self.BucketType.ZONAL + bucket_type = self.BucketType.ZONAL_HIERARCHICAL else: # This should be updated to include HNS in the future - bucket_type = self.BucketType.REGIONAL + bucket_type = self.BucketType.NON_HIERARCHICAL self._storage_layout_cache[bucket] = bucket_type return bucket_type except Exception as e: @@ -382,7 +382,7 @@ async def _get_storage_layout(self, bucket): self._storage_layout_cache[bucket] = self.BucketType.UNKNOWN return self.BucketType.UNKNOWN - _get_storage_layout = asyn.sync_wrapper(_get_storage_layout) + _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) @property def _location(self): @@ -502,7 +502,7 @@ def _format_path(self, path, args): async def _request( self, method, path, *args, headers=None, json=None, data=None, **kwargs ): - if self.bucket_type == self.BucketType.ZONAL: + if self.bucket_type == self.BucketType.ZONAL_HIERARCHICAL: adapter = self._get_adapter(project=self.project, token=self.credentials.credentials) result = adapter.handle(method, self._format_path(path, args), **kwargs) if result is not None: diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index ef4735ef..cd4d51ad 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1705,61 +1705,50 @@ def test_get_error(gcs): def test_storage_layout_zonal(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: - # Mock the storage layout response for a Zonal bucket - mock_get_layout.return_value = GCSFileSystem.BucketType.ZONAL + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout: + gcs = gcs_factory() - gcs = gcs_factory(project=TEST_BUCKET) + # Verify that _get_storage_layout was called with the BUCKET name + mock_get_layout.assert_called_once() - # Verify that _get_storage_layout was called - mock_get_layout.assert_called_with(TEST_BUCKET) + # Verify that the bucket_type attribute on the gcs instance is set to ZONAL + assert gcs.bucket_type == gcs.BucketType.ZONAL_HIERARCHICAL - # Verify that the bucket_type is set to "Zonal" - assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL +def test_storage_layout_existing_buckets(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: + gcs = gcs_factory() -def test_storage_layout_regional(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: - # Mock the storage layout response for a Regional bucket - mock_get_layout.return_value = GCSFileSystem.BucketType.REGIONAL + # Verify that _get_storage_layout was called with the BUCKET name + mock_get_layout.assert_called_once() - gcs = gcs_factory(project=TEST_BUCKET) + # Verify that the bucket_type attribute on the gcs instance is set to ZONAL + assert gcs.bucket_type == gcs.BucketType.NON_HIERARCHICAL - # Verify that _get_storage_layout was called - mock_get_layout.assert_called_with(TEST_BUCKET) - # Verify that the bucket_type is set to "Regional/Multi-Regional" - assert gcs.bucket_type == GCSFileSystem.BucketType.REGIONAL +def test_gcs_adapter_called_for_zonal_bucket(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", 
return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout, \ + mock.patch("gcsfs.core.GCSAdapter.handle", return_value=(200, {}, {}, b"some data")) as mock_handle: - -def test_adapter_called_for_zonal_bucket(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout, \ - mock.patch("gcsfs.core.GCSAdapter.handle") as mock_handle: - - mock_get_layout.return_value = GCSFileSystem.BucketType.ZONAL - mock_handle.return_value = (200, {}, {}, b"some data") - - gcs = gcs_factory(project=TEST_BUCKET) - assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL + gcs = gcs_factory() # This call should be routed to the gcs adapter and this test should be - # updated once read functionality is added in adapter + # updated with real data assertions instead of mock_handle once read functionality is added in adapter gcs.cat_file(f"{TEST_BUCKET}/some/file") + assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL_HIERARCHICAL mock_handle.assert_called_once() -def test_adapter_not_called_for_regional_bucket(gcs): - with mock.patch("gcsfs.core.GCSFileSystem._get_storage_layout") as mock_get_layout: - - mock_get_layout.return_value = GCSFileSystem.BucketType.REGIONAL - - # We need to set the bucket_type on the existing `gcs` fixture instance - gcs.bucket_type = GCSFileSystem.BucketType.REGIONAL +def test_gcs_adapter_not_called_when_bucket_type_is_not_set(gcs): + with mock.patch("gcsfs.core.GCSAdapter.handle", + return_value=(200, {}, {}, b"some data")) as mock_handle: + assert gcs.bucket_type == gcs.BucketType.UNKNOWN # Use an existing file from the test setup test_file = f"{TEST_BUCKET}/test/accounts.1.json" # This call should NOT be routed to the adapter data = gcs.cat(test_file) assert data == files["test/accounts.1.json"] + mock_handle.assert_not_called() From eb209f531ee2c1e793f7c5239f04c7e112fc3e17 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 6 Oct 2025 12:24:27 +0000 Subject: [PATCH 04/59] adds feature toggle behind which experimental feature support would be developed - experimental_zb_hns_support --- gcsfs/core.py | 17 +++++++----- gcsfs/tests/conftest.py | 5 ++-- gcsfs/tests/test_core.py | 59 ++++++++++++---------------------------- 3 files changed, 30 insertions(+), 51 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index d2b24e45..a110ca37 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -290,7 +290,7 @@ class BucketType(Enum): NON_HIERARCHICAL = "NON_HIERARCHICAL" UNKNOWN = "UNKNOWN" - bucket_type = None + bucket_type = "UNKNOWN" _adapter = None @classmethod @@ -318,7 +318,7 @@ def __init__( endpoint_url=None, default_location=None, version_aware=False, - bucket_type=None, + experimental_zb_hns_support=False, **kwargs, ): if cache_timeout is not None: @@ -345,13 +345,16 @@ def __init__( self.session_kwargs = session_kwargs or {} self.default_location = default_location self.version_aware = version_aware - self.bucket_type = bucket_type + self.experimental_zb_hns_support = experimental_zb_hns_support self._storage_layout_cache = {} - try: - self.bucket_type = self._sync_get_storage_layout(project) - except Exception as e: - logger.warning(f"Failed to get storage layout for bucket {project}: {e}") + if self.experimental_zb_hns_support: + try: + self.bucket_type = self._sync_get_storage_layout(project) + except Exception as e: + logger.warning( + f"Failed to get storage layout for bucket {project}: {e}" + ) if check_connection: warnings.warn( diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 
e1a73732..57f605b5 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -91,10 +91,9 @@ def docker_gcs(): def gcs_factory(docker_gcs): params["endpoint_url"] = docker_gcs - def factory(default_location=None): + def factory(**kwargs): GCSFileSystem.clear_instance_cache() - params["default_location"] = default_location - return fsspec.filesystem("gcs", **params) + return fsspec.filesystem("gcs", **params, **kwargs) return factory diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index cd4d51ad..7fd74052 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1704,51 +1704,28 @@ def test_get_error(gcs): gcs.get_file(f"{TEST_BUCKET}/doesnotexist", "other") -def test_storage_layout_zonal(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout: - gcs = gcs_factory() - - # Verify that _get_storage_layout was called with the BUCKET name - mock_get_layout.assert_called_once() - - # Verify that the bucket_type attribute on the gcs instance is set to ZONAL - assert gcs.bucket_type == gcs.BucketType.ZONAL_HIERARCHICAL - - -def test_storage_layout_existing_buckets(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: - gcs = gcs_factory() - - # Verify that _get_storage_layout was called with the BUCKET name +def test_storage_layout_called_when_toggle_is_on(gcs_factory): + with mock.patch( + "gcsfs.core.GCSFileSystem._sync_get_storage_layout", + return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL, + ) as mock_get_layout: + gcs = gcs_factory(experimental_zb_hns_support=True) + + # Verify that _get_storage_layout was called mock_get_layout.assert_called_once() - # Verify that the bucket_type attribute on the gcs instance is set to ZONAL - assert gcs.bucket_type == gcs.BucketType.NON_HIERARCHICAL - - -def test_gcs_adapter_called_for_zonal_bucket(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout, \ - mock.patch("gcsfs.core.GCSAdapter.handle", return_value=(200, {}, {}, b"some data")) as mock_handle: - - gcs = gcs_factory() - - # This call should be routed to the gcs adapter and this test should be - # updated with real data assertions instead of mock_handle once read functionality is added in adapter - gcs.cat_file(f"{TEST_BUCKET}/some/file") - + # Verify that the bucket_type attribute on the gcs instance is set assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL_HIERARCHICAL - mock_handle.assert_called_once() -def test_gcs_adapter_not_called_when_bucket_type_is_not_set(gcs): - with mock.patch("gcsfs.core.GCSAdapter.handle", - return_value=(200, {}, {}, b"some data")) as mock_handle: +def test_storage_layout_not_called_when_toggle_is__default_off(gcs_factory): + with mock.patch( + "gcsfs.core.GCSFileSystem._sync_get_storage_layout" + ) as mock_get_layout: + gcs = gcs_factory() - assert gcs.bucket_type == gcs.BucketType.UNKNOWN - # Use an existing file from the test setup - test_file = f"{TEST_BUCKET}/test/accounts.1.json" + # Verify that _get_storage_layout was not called + mock_get_layout.assert_not_called() - # This call should NOT be routed to the adapter - data = gcs.cat(test_file) - assert data == files["test/accounts.1.json"] - mock_handle.assert_not_called() + # Verify that the bucket_type attribute on the gcs instance is 
UNKNOWN + assert gcs.bucket_type == "UNKNOWN" From 0b606b1995b9e2adcb50abaae0571222aaaa062c Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 6 Oct 2025 12:33:54 +0000 Subject: [PATCH 05/59] updates tests to include experimental feature toggle --- gcsfs/core.py | 2 +- gcsfs/tests/test_core.py | 60 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index a110ca37..a6d935ca 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -290,7 +290,7 @@ class BucketType(Enum): NON_HIERARCHICAL = "NON_HIERARCHICAL" UNKNOWN = "UNKNOWN" - bucket_type = "UNKNOWN" + bucket_type = BucketType.UNKNOWN _adapter = None @classmethod diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 7fd74052..be4a948f 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1718,7 +1718,7 @@ def test_storage_layout_called_when_toggle_is_on(gcs_factory): assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL_HIERARCHICAL -def test_storage_layout_not_called_when_toggle_is__default_off(gcs_factory): +def test_storage_layout_not_called_when_toggle_is_default_off(gcs_factory): with mock.patch( "gcsfs.core.GCSFileSystem._sync_get_storage_layout" ) as mock_get_layout: @@ -1729,3 +1729,61 @@ def test_storage_layout_not_called_when_toggle_is__default_off(gcs_factory): # Verify that the bucket_type attribute on the gcs instance is UNKNOWN assert gcs.bucket_type == "UNKNOWN" + +def test_storage_layout_zonal(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout: + gcs = gcs_factory(experimental_zb_hns_support=True) + + # Verify that _get_storage_layout was called with the BUCKET name + mock_get_layout.assert_called_once() + + # Verify that the bucket_type attribute on the gcs instance is set to ZONAL + assert gcs.bucket_type == gcs.BucketType.ZONAL_HIERARCHICAL + + +def test_default_bucket_type(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: + gcs = gcs_factory() + + # Verify that _get_storage_layout was called with the BUCKET name + mock_get_layout.assert_not_called() + + # Verify that the bucket_type attribute on the gcs instance is set to ZONAL + assert gcs.bucket_type == gcs.BucketType.UNKNOWN + +def test_non_hierarchical_bucket_type(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: + gcs = gcs_factory(experimental_zb_hns_support=True) + + # Verify that _get_storage_layout was called with the BUCKET name + mock_get_layout.assert_called_once() + + # Verify that the bucket_type attribute on the gcs instance is set to ZONAL + assert gcs.bucket_type == gcs.BucketType.NON_HIERARCHICAL + +def test_gcs_adapter_called_for_zonal_bucket(gcs_factory): + with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout, \ + mock.patch("gcsfs.core.GCSAdapter.handle", return_value=(200, {}, {}, b"some data")) as mock_handle: + + gcs = gcs_factory(experimental_zb_hns_support=True) + + # This call should be routed to the gcs adapter and this test should be + # updated with real data assertions instead of mock_handle once read functionality is added in adapter + gcs.cat_file(f"{TEST_BUCKET}/some/file") + + assert gcs.bucket_type 
== GCSFileSystem.BucketType.ZONAL_HIERARCHICAL + mock_handle.assert_called_once() + + +def test_gcs_adapter_not_called_when_bucket_type_is_not_set(gcs): + with mock.patch("gcsfs.core.GCSAdapter.handle", + return_value=(200, {}, {}, b"some data")) as mock_handle: + + assert gcs.bucket_type == gcs.BucketType.UNKNOWN + # Use an existing file from the test setup + test_file = f"{TEST_BUCKET}/test/accounts.1.json" + + # This call should NOT be routed to the adapter + data = gcs.cat(test_file) + assert data == files["test/accounts.1.json"] + mock_handle.assert_not_called() From 409c86e461600442f45156329f0879d2f2ad4edb Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 6 Oct 2025 12:36:14 +0000 Subject: [PATCH 06/59] minor fix --- gcsfs/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index be4a948f..15cc488f 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1728,7 +1728,7 @@ def test_storage_layout_not_called_when_toggle_is_default_off(gcs_factory): mock_get_layout.assert_not_called() # Verify that the bucket_type attribute on the gcs instance is UNKNOWN - assert gcs.bucket_type == "UNKNOWN" + assert gcs.bucket_type == gcs.BucketType.UNKNOWN def test_storage_layout_zonal(gcs_factory): with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout: From 8a4ca85d54c53a3b5334487e9efab4780900cfea Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Sat, 11 Oct 2025 19:06:56 +0000 Subject: [PATCH 07/59] moves mrd creation inside core.py --- gcsfs/core.py | 111 +++++++++++++++++++++++++++++++------------------ gcsfs/zonal.py | 73 ++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 41 deletions(-) create mode 100644 gcsfs/zonal.py diff --git a/gcsfs/core.py b/gcsfs/core.py index a6d935ca..343ee6f9 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -32,6 +32,9 @@ from .retry import errs, retry_request, validate_response from .gcs_adapter import GCSAdapter from enum import Enum +from google.cloud.storage._experimental.asyncio.async_grpc_client import \ + AsyncGrpcClient +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import (AsyncMultiRangeDownloader) logger = logging.getLogger("gcsfs") @@ -348,14 +351,6 @@ def __init__( self.experimental_zb_hns_support = experimental_zb_hns_support self._storage_layout_cache = {} - if self.experimental_zb_hns_support: - try: - self.bucket_type = self._sync_get_storage_layout(project) - except Exception as e: - logger.warning( - f"Failed to get storage layout for bucket {project}: {e}" - ) - if check_connection: warnings.warn( "The `check_connection` argument is deprecated and will be removed in a future release.", @@ -367,26 +362,36 @@ def __init__( ) async def _get_storage_layout(self, bucket): - if bucket in self._storage_layout_cache: - return self._storage_layout_cache[bucket] + if self.experimental_zb_hns_support: + if bucket in self._storage_layout_cache: + return self._storage_layout_cache[bucket] - try: - response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) - if response.get("locationType") == "zone": - bucket_type = self.BucketType.ZONAL_HIERARCHICAL - else: - # This should be updated to include HNS in the future - bucket_type = self.BucketType.NON_HIERARCHICAL - self._storage_layout_cache[bucket] = bucket_type - return bucket_type - except Exception as e: - logger.error(f"Could not determine 
storage layout for bucket {bucket}: {e}") - # Default to UNKNOWN - self._storage_layout_cache[bucket] = self.BucketType.UNKNOWN + try: + response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) + if response.get("locationType") == "zone": + bucket_type = self.BucketType.ZONAL_HIERARCHICAL + else: + # This should be updated to include HNS in the future + bucket_type = self.BucketType.NON_HIERARCHICAL + self._storage_layout_cache[bucket] = bucket_type + return bucket_type + except Exception as e: + logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") + # Default to UNKNOWN + self._storage_layout_cache[bucket] = self.BucketType.UNKNOWN + return self.BucketType.UNKNOWN + else: return self.BucketType.UNKNOWN _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) + async def createMRD(self, grpcclient, bucket_name, name): + mrd = AsyncMultiRangeDownloader(grpcclient, bucket_name, name) + await mrd.open() + return mrd + + _sync_createMRD = asyn.sync_wrapper(createMRD) + @property def _location(self): return self._endpoint or _location() @@ -505,11 +510,11 @@ def _format_path(self, path, args): async def _request( self, method, path, *args, headers=None, json=None, data=None, **kwargs ): - if self.bucket_type == self.BucketType.ZONAL_HIERARCHICAL: - adapter = self._get_adapter(project=self.project, token=self.credentials.credentials) - result = adapter.handle(method, self._format_path(path, args), **kwargs) - if result is not None: - return result + # if self.bucket_type == self.BucketType.ZONAL_HIERARCHICAL: + #adapter = self._get_adapter(project=self.project, token=self.credentials.credentials) + #result = adapter.handle(method, self._format_path(path, args), **kwargs) + # if result is not None: + # return result await self._set_session() if hasattr(data, "seek"): @@ -1733,19 +1738,43 @@ def _open( if block_size is None: block_size = self.default_block_size const = consistency or self.consistency - return GCSFile( - self, - path, - mode, - block_size, - cache_options=cache_options, - consistency=const, - metadata=metadata, - acl=acl, - autocommit=autocommit, - fixed_key_metadata=fixed_key_metadata, - **kwargs, - ) + bucket, key, path_generation = self.split_path(path) + bucket_type = self._sync_get_storage_layout(bucket) + if bucket_type == self.BucketType.ZONAL_HIERARCHICAL: + # grpc client location needs to be discussed so same client can be used for different streams + client = AsyncGrpcClient()._grpc_client + bucket_name, name, _ = self.split_path(path) + mrd = self._sync_createMRD(client, bucket_name, name) + # TODO remove circular import issue + from .zonal import GCSZonalFile + return GCSZonalFile( + self, + path, + mode, + block_size, + cache_options=cache_options, + consistency=const, + metadata=metadata, + acl=acl, + autocommit=autocommit, + fixed_key_metadata=fixed_key_metadata, + mrd=mrd, + **kwargs, + ) + else: + return GCSFile( + self, + path, + mode, + block_size, + cache_options=cache_options, + consistency=const, + metadata=metadata, + acl=acl, + autocommit=autocommit, + fixed_key_metadata=fixed_key_metadata, + **kwargs, + ) @classmethod def _split_path(cls, path, version_aware=False): diff --git a/gcsfs/zonal.py b/gcsfs/zonal.py new file mode 100644 index 00000000..37add403 --- /dev/null +++ b/gcsfs/zonal.py @@ -0,0 +1,73 @@ +import logging +from io import BytesIO + +from fsspec import asyn + +from .core import GCSFile + +logger = logging.getLogger("gcsfs") + + +class GCSZonalFile(GCSFile): + """ + A GCSFile subclass for 
Zonal Storage buckets. + + This may have custom logic for operations like fetching data ranges. + """ + + def __init__( + self, + gcsfs, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_type="readahead", + cache_options=None, + acl=None, + consistency="md5", + metadata=None, + content_type=None, + timeout=None, + fixed_key_metadata=None, + generation=None, + kms_key_name=None, + mrd=None, + **kwargs, + ): + self.mrd = mrd + super().__init__( + gcsfs, + path, + mode=mode, + block_size=block_size, + autocommit=autocommit, + cache_type=cache_type, + cache_options=cache_options, + acl=acl, + consistency=consistency, + metadata=metadata, + content_type=content_type, + timeout=timeout, + fixed_key_metadata=fixed_key_metadata, + generation=generation, + kms_key_name=kms_key_name, + **kwargs, + ) + + def _fetch_range(self, start=None, end=None): + """ + Get data from a Zonal Storage bucket. + + This method would contain custom logic optimized for zonal reads. + """ + # Placeholder for custom zonal fetch logic. + logger.info("--- GCSZonalFile._fetch_range is called ---") + buffer = BytesIO() + sync_download_ranges = asyn.sync_wrapper(self.mrd.download_ranges) + offset = start + length = (end - start) + 1 + results = sync_download_ranges([(offset, length, buffer)]) + downloaded_data = buffer.getvalue() + print(downloaded_data) + return downloaded_data From c5a9eae82039360a5e2cb973794aba719a7fbbf1 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Sun, 12 Oct 2025 09:06:53 +0000 Subject: [PATCH 08/59] moves mrd creation to separate GCSFile --- .gitignore | 1 + gcsfs/core.py | 4 ++-- gcsfs/zonal.py | 44 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 023f552c..9cc47b5e 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,4 @@ target/ .pytest_cache/ libs/*.whl +*zonal_test.py diff --git a/gcsfs/core.py b/gcsfs/core.py index 343ee6f9..3917c10c 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -1744,7 +1744,7 @@ def _open( # grpc client location needs to be discussed so same client can be used for different streams client = AsyncGrpcClient()._grpc_client bucket_name, name, _ = self.split_path(path) - mrd = self._sync_createMRD(client, bucket_name, name) + # mrd = self._sync_createMRD(client, bucket_name, name) # TODO remove circular import issue from .zonal import GCSZonalFile return GCSZonalFile( @@ -1758,7 +1758,7 @@ def _open( acl=acl, autocommit=autocommit, fixed_key_metadata=fixed_key_metadata, - mrd=mrd, + client=client, **kwargs, ) else: diff --git a/gcsfs/zonal.py b/gcsfs/zonal.py index 37add403..a683e547 100644 --- a/gcsfs/zonal.py +++ b/gcsfs/zonal.py @@ -1,7 +1,11 @@ +import asyncio import logging +import threading from io import BytesIO from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ + AsyncMultiRangeDownloader from .core import GCSFile @@ -32,11 +36,15 @@ def __init__( fixed_key_metadata=None, generation=None, kms_key_name=None, - mrd=None, + client=None, **kwargs, ): - self.mrd = mrd - super().__init__( + self.gcsfs = gcsfs + bucket_name, name, _ = self.gcsfs.split_path(path) + self.mrd = AsyncMultiRangeDownloader(client, bucket_name, name) + self._run_async_in_new_loop(self.mrd.open()) + GCSFile.__init__( + self, gcsfs, path, mode=mode, @@ -64,10 +72,36 @@ def _fetch_range(self, start=None, end=None): # Placeholder for custom zonal fetch logic. 
logger.info("--- GCSZonalFile._fetch_range is called ---") buffer = BytesIO() - sync_download_ranges = asyn.sync_wrapper(self.mrd.download_ranges) offset = start length = (end - start) + 1 - results = sync_download_ranges([(offset, length, buffer)]) + self._run_async_in_new_loop(self.mrd.download_ranges([(offset, length, buffer)])) downloaded_data = buffer.getvalue() print(downloaded_data) return downloaded_data + + def _run_async_in_new_loop(self, coro): + """ + Runs a coroutine in a new event loop in a separate thread. + This is a workaround to avoid deadlocks when calling async code from a + sync method that is itself running inside an outer event loop. + """ + result = None + exception = None + + def runner(): + nonlocal result, exception + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + result = asyncio.run_coroutine_threadsafe(coro, loop) + loop.close() + except Exception as e: + exception = e + + thread = threading.Thread(target=runner) + thread.start() + thread.join() + + if exception: + raise exception + return result From e95495f0e1781b9a544eb881af67275c0bf6d2aa Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Sun, 12 Oct 2025 09:11:18 +0000 Subject: [PATCH 09/59] minor comment --- gcsfs/zonal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcsfs/zonal.py b/gcsfs/zonal.py index a683e547..29cacdac 100644 --- a/gcsfs/zonal.py +++ b/gcsfs/zonal.py @@ -82,7 +82,7 @@ def _fetch_range(self, start=None, end=None): def _run_async_in_new_loop(self, coro): """ Runs a coroutine in a new event loop in a separate thread. - This is a workaround to avoid deadlocks when calling async code from a + This is done to avoid any deadlocks when calling async code from a sync method that is itself running inside an outer event loop. """ result = None From 95440d50aa44da13f8aea7b8eb0838f7f4d66c96 Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Sat, 11 Oct 2025 17:51:29 +0000 Subject: [PATCH 10/59] Extend gcsfs to create new filesystem Extend gcsfs to create new filesystem. async download --- gcsfs/core.py | 70 +++++++----------------- gcsfs/zonal_handler.py | 46 ++++++++++++++++ gcsfs/zonalfilesystem.py | 114 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 51 deletions(-) create mode 100644 gcsfs/zonal_handler.py create mode 100644 gcsfs/zonalfilesystem.py diff --git a/gcsfs/core.py b/gcsfs/core.py index 3917c10c..c0f0caa0 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -287,20 +287,18 @@ class GCSFileSystem(asyn.AsyncFileSystem): protocol = "gs", "gcs" async_impl = True - class BucketType(Enum): - ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" - HIERARCHICAL = "HIERARCHICAL" - NON_HIERARCHICAL = "NON_HIERARCHICAL" - UNKNOWN = "UNKNOWN" - - bucket_type = BucketType.UNKNOWN - _adapter = None + def __new__(cls, *args, **kwargs): + """ + Factory to return a ZonalFileSystem instance if the experimental + flag is enabled. 
+ """ + experimental_support = kwargs.pop('experimental_zb_hns_support', False) - @classmethod - def _get_adapter(cls, **kwargs): - if cls._adapter is None: - cls._adapter = GCSAdapter(**kwargs) - return cls._adapter + if experimental_support: + from .gcsfilesystem_adapter import GCSFileSystemAdapter + return object.__new__(GCSFileSystemAdapter) + else: + return object.__new__(cls) def __init__( self, @@ -349,7 +347,14 @@ def __init__( self.default_location = default_location self.version_aware = version_aware self.experimental_zb_hns_support = experimental_zb_hns_support - self._storage_layout_cache = {} + + if self.experimental_zb_hns_support: + try: + self.bucket_type = self._sync_get_storage_layout(project) + except Exception as e: + logger.warning( + f"Failed to get storage layout for bucket {project}: {e}" + ) if check_connection: warnings.warn( @@ -361,37 +366,6 @@ def __init__( project, access, token, on_google=self.on_google ) - async def _get_storage_layout(self, bucket): - if self.experimental_zb_hns_support: - if bucket in self._storage_layout_cache: - return self._storage_layout_cache[bucket] - - try: - response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) - if response.get("locationType") == "zone": - bucket_type = self.BucketType.ZONAL_HIERARCHICAL - else: - # This should be updated to include HNS in the future - bucket_type = self.BucketType.NON_HIERARCHICAL - self._storage_layout_cache[bucket] = bucket_type - return bucket_type - except Exception as e: - logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") - # Default to UNKNOWN - self._storage_layout_cache[bucket] = self.BucketType.UNKNOWN - return self.BucketType.UNKNOWN - else: - return self.BucketType.UNKNOWN - - _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) - - async def createMRD(self, grpcclient, bucket_name, name): - mrd = AsyncMultiRangeDownloader(grpcclient, bucket_name, name) - await mrd.open() - return mrd - - _sync_createMRD = asyn.sync_wrapper(createMRD) - @property def _location(self): return self._endpoint or _location() @@ -510,12 +484,6 @@ def _format_path(self, path, args): async def _request( self, method, path, *args, headers=None, json=None, data=None, **kwargs ): - # if self.bucket_type == self.BucketType.ZONAL_HIERARCHICAL: - #adapter = self._get_adapter(project=self.project, token=self.credentials.credentials) - #result = adapter.handle(method, self._format_path(path, args), **kwargs) - # if result is not None: - # return result - await self._set_session() if hasattr(data, "seek"): data.seek(0) diff --git a/gcsfs/zonal_handler.py b/gcsfs/zonal_handler.py new file mode 100644 index 00000000..a5ea7a63 --- /dev/null +++ b/gcsfs/zonal_handler.py @@ -0,0 +1,46 @@ +from .core import GCSFile +from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from io import BytesIO + +class ZonalHandler(): + """ + Handler for Zonal bucket operations. + """ + + def __init__(self, gcsfilesystemadapter, **kwargs): + """ + Initializes the ZonalFile object. + """ + self.fs = gcsfilesystemadapter + self.mrd_cache = {} + + async def _get_downloader(self, bucket, object, generation=None): + """ + Initializes the AsyncMultiRangeDownloader. 
+ """ + if self.fs.grpc_client is None: + self.fs.grpc_client = AsyncGrpcClient().grpc_client + + if self.mrd_cache.get((bucket, object, generation)): + return self.mrd_cache[(bucket, object, generation)] + + downloader = await AsyncMultiRangeDownloader.create_mrd( + self.fs.grpc_client, bucket, object, generation + ) + self.mrd_cache[(bucket, object, generation)] = downloader + return downloader + + async def download_range(self,path, start, end): + """ + Downloads a byte range from the file asynchronously. + """ + bucket, object, generation = self.fs.split_path(path) + mrd = await self._get_downloader(bucket, object, generation) + offset = start + length = end - start + 1 + buffer = BytesIO() + results = await mrd.download_ranges([(offset, length, buffer)]) + return buffer.getvalue() + \ No newline at end of file diff --git a/gcsfs/zonalfilesystem.py b/gcsfs/zonalfilesystem.py new file mode 100644 index 00000000..c8211ec2 --- /dev/null +++ b/gcsfs/zonalfilesystem.py @@ -0,0 +1,114 @@ +import logging +from enum import Enum + +from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader + +from .core import GCSFileSystem, GCSFile +from google.cloud.storage._experimental.asyncio.async_grpc_client import (AsyncGrpcClient) + +from .gcs_adapter import GCSAdapter +from .zonalfile import ZonalFile + +logger = logging.getLogger("gcsfs") + +class BucketType(Enum): + ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" + HIERARCHICAL = "HIERARCHICAL" + NON_HIERARCHICAL = "NON_HIERARCHICAL" + UNKNOWN = "UNKNOWN" + +gcs_file_types = { + BucketType.ZONAL_HIERARCHICAL: ZonalFile, + BucketType.UNKNOWN : GCSFile, +} + +class ZonalFileSystem(GCSFileSystem): + """ + An experimental subclass of GCSFileSystem that will contain specialized + logic for Zonal and HNS buckets. 
+ """ + def __init__(self, *args, **kwargs): + # Ensure the experimental flag is not passed down again + kwargs.pop('experimental_zb_hns_support', None) + super().__init__(*args, **kwargs) + self.grpc_client = AsyncGrpcClient().grpc_client + self._storage_layout_cache = {} + print("ZonalFileSystem initialized") + + async def _get_storage_layout(self, bucket): + print("Getting storage layout for bucket {}".format(bucket)) + if bucket in self._storage_layout_cache: + return self._storage_layout_cache[bucket] + try: + response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) + if response.get("locationType") == "zone": + bucket_type = BucketType.ZONAL_HIERARCHICAL + else: + # This should be updated to include HNS in the future + bucket_type = BucketType.NON_HIERARCHICAL + self._storage_layout_cache[bucket] = bucket_type + print("Getting storage layout for bucket {}".format(bucket)) + return bucket_type + except Exception as e: + logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") + # Default to UNKNOWN + self._storage_layout_cache[bucket] = BucketType.UNKNOWN + return BucketType.UNKNOWN + + _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) + + async def _init_async_multi_range_reader(self, client): + """Initializes the reader if the bucket is Zonal [4].""" + + print("Initializing AsyncMultiRangeReader") + async_multi_range_reader = await AsyncMultiRangeDownloader.create_mrd( + client, bucket_name="chandrasiri-rs", object_name="sunidhi" + ) + print("Initialized AsyncMultiRangeReader") + # return async_multi_range_reader + return client + + init_mrd = asyn.sync_wrapper(_init_async_multi_range_reader) + def _open( + self, + path, + mode="rb", + **kwargs, + ): + """ + Open a file. + """ + print(f"ZonalFileSystem._open() called for path: {path}") + bucket, _, _ = self.split_path(path) + print(f"Bucket: {bucket}") + try: + bucket_type = self._sync_get_storage_layout(bucket) + if bucket_type == BucketType.ZONAL_HIERARCHICAL: + print("Creating mrd {}".format(bucket)) + # mrd = self.init_mrd(self.grpc_client) + print("Opening ZonalFile") + # return ZonalFile(path=path, bucket=bucket, mrd=mrd, **kwargs) + except Exception as e: + logger.warning( + f"Failed to get storage layout for bucket {bucket}: {e}" + ) + return super()._open(path, mode, **kwargs) + + def _fetch_range(self, start=None, end=None): + """Get data from GCS + start, end : None or integers + if not both None, fetch only given range + """ + print("ZonalFileSystem._fetch_range() called") + + async def _cat_file(self, path, start=None, end=None, **kwargs): + """Simple one-shot get of file data""" + print("ZonalFileSystem._cat_file() called") + client = AsyncGrpcClient().grpc_client + mrd = await AsyncMultiRangeDownloader.create_mrd( + client, bucket_name="chandrasiri-rs", object_name="sunidhi" + ) + return b'This is an output' + + # def _upl \ No newline at end of file From dfcbb7fc6d3fcf93b31b7f67701de43766650604 Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Sat, 11 Oct 2025 18:55:39 +0000 Subject: [PATCH 11/59] Extend gcsfs and gcsfile to override methods. 
download happens synchronously --- gcsfs/core.py | 63 ++++-------------- gcsfs/gcs_adapter.py | 41 ------------ gcsfs/gcsfilesystem_adapter.py | 68 ++++++++++++++++++++ gcsfs/zonal.py | 107 ------------------------------- gcsfs/zonal_file.py | 46 +++++++++++++ gcsfs/zonal_handler.py | 46 ------------- gcsfs/zonalfilesystem.py | 114 --------------------------------- 7 files changed, 127 insertions(+), 358 deletions(-) delete mode 100644 gcsfs/gcs_adapter.py create mode 100644 gcsfs/gcsfilesystem_adapter.py delete mode 100644 gcsfs/zonal.py create mode 100644 gcsfs/zonal_file.py delete mode 100644 gcsfs/zonal_handler.py delete mode 100644 gcsfs/zonalfilesystem.py diff --git a/gcsfs/core.py b/gcsfs/core.py index c0f0caa0..0b03e2a5 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -30,11 +30,6 @@ from .credentials import GoogleCredentials from .inventory_report import InventoryReport from .retry import errs, retry_request, validate_response -from .gcs_adapter import GCSAdapter -from enum import Enum -from google.cloud.storage._experimental.asyncio.async_grpc_client import \ - AsyncGrpcClient -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import (AsyncMultiRangeDownloader) logger = logging.getLogger("gcsfs") @@ -348,14 +343,6 @@ def __init__( self.version_aware = version_aware self.experimental_zb_hns_support = experimental_zb_hns_support - if self.experimental_zb_hns_support: - try: - self.bucket_type = self._sync_get_storage_layout(project) - except Exception as e: - logger.warning( - f"Failed to get storage layout for bucket {project}: {e}" - ) - if check_connection: warnings.warn( "The `check_connection` argument is deprecated and will be removed in a future release.", @@ -1706,43 +1693,19 @@ def _open( if block_size is None: block_size = self.default_block_size const = consistency or self.consistency - bucket, key, path_generation = self.split_path(path) - bucket_type = self._sync_get_storage_layout(bucket) - if bucket_type == self.BucketType.ZONAL_HIERARCHICAL: - # grpc client location needs to be discussed so same client can be used for different streams - client = AsyncGrpcClient()._grpc_client - bucket_name, name, _ = self.split_path(path) - # mrd = self._sync_createMRD(client, bucket_name, name) - # TODO remove circular import issue - from .zonal import GCSZonalFile - return GCSZonalFile( - self, - path, - mode, - block_size, - cache_options=cache_options, - consistency=const, - metadata=metadata, - acl=acl, - autocommit=autocommit, - fixed_key_metadata=fixed_key_metadata, - client=client, - **kwargs, - ) - else: - return GCSFile( - self, - path, - mode, - block_size, - cache_options=cache_options, - consistency=const, - metadata=metadata, - acl=acl, - autocommit=autocommit, - fixed_key_metadata=fixed_key_metadata, - **kwargs, - ) + return GCSFile( + self, + path, + mode, + block_size, + cache_options=cache_options, + consistency=const, + metadata=metadata, + acl=acl, + autocommit=autocommit, + fixed_key_metadata=fixed_key_metadata, + **kwargs, + ) @classmethod def _split_path(cls, path, version_aware=False): diff --git a/gcsfs/gcs_adapter.py b/gcsfs/gcs_adapter.py deleted file mode 100644 index f62e865b..00000000 --- a/gcsfs/gcs_adapter.py +++ /dev/null @@ -1,41 +0,0 @@ -from google.cloud import storage -import re - -class GCSAdapter: - def __init__(self, project=None, token=None): - self.client = storage.Client(project=project, credentials=token) - - def _parse_path(self, path): - match = 
re.match(r"https://storage.googleapis.com/storage/v1/b/([^/]+)/o/([^?]+)", path) - if match: - return match.groups() - return None, None - - def read_object(self, path, **kwargs): - bucket_name, blob_name = self._parse_path(path) - if not bucket_name or not blob_name: - raise ValueError(f"Invalid GCS path: {path}") - - bucket = self.client.bucket(bucket_name) - blob = bucket.blob(blob_name) - - # Assuming the 'read' operation is for downloading the object's content - try: - content = blob.download_as_bytes() - # The original _request method returns a tuple, so we mimic that structure - # (status, headers, info, contents) - return 200, {}, {}, content - except Exception as e: - # Mimic a basic error response - return 404, {}, {}, str(e).encode() - - def handle(self, method, path, **kwargs): - # Check if the request is a read operation. - # A simple check is to see if the method is 'GET' and if it's a media download link. - is_read_request = method.upper() == 'GET' and "alt=media" in path - - if is_read_request: - return self.read_object(path, **kwargs) - - # If it's not a read request this adapter should handle, do nothing. - return None \ No newline at end of file diff --git a/gcsfs/gcsfilesystem_adapter.py b/gcsfs/gcsfilesystem_adapter.py new file mode 100644 index 00000000..cbd0a361 --- /dev/null +++ b/gcsfs/gcsfilesystem_adapter.py @@ -0,0 +1,68 @@ +import logging +from enum import Enum + +from fsspec import asyn + +from .core import GCSFileSystem, GCSFile +from .zonal_file import ZonalFile +from io import BytesIO +import asyncio + +logger = logging.getLogger("gcsfs") + +class BucketType(Enum): + ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" + HIERARCHICAL = "HIERARCHICAL" + NON_HIERARCHICAL = "NON_HIERARCHICAL" + UNKNOWN = "UNKNOWN" + +gcs_file_types = { + BucketType.ZONAL_HIERARCHICAL: ZonalFile, + BucketType.UNKNOWN : GCSFile, + None : GCSFile, +} + +class GCSFileSystemAdapter(GCSFileSystem): + """ + An subclass of GCSFileSystem that will contain specialized + logic for Zonal and HNS buckets. + """ + def __init__(self, *args, **kwargs): + kwargs.pop('experimental_zb_hns_support', None) + super().__init__(*args, **kwargs) + self.grpc_client = None + self._storage_layout_cache = {} + + async def _get_storage_layout(self, bucket): + if bucket in self._storage_layout_cache: + return self._storage_layout_cache[bucket] + try: + response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) + if response.get("locationType") == "zone": + bucket_type = BucketType.ZONAL_HIERARCHICAL + else: + # This should be updated to include HNS in the future + bucket_type = BucketType.NON_HIERARCHICAL + self._storage_layout_cache[bucket] = bucket_type + return bucket_type + except Exception as e: + logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") + # Default to UNKNOWN + self._storage_layout_cache[bucket] = BucketType.UNKNOWN + return BucketType.UNKNOWN + + _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) + + + def _open( + self, + path, + mode="rb", + **kwargs, + ): + """ + Open a file. 
+ """ + bucket, _, _ = self.split_path(path) + bucket_type = self._sync_get_storage_layout(bucket) + return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) diff --git a/gcsfs/zonal.py b/gcsfs/zonal.py deleted file mode 100644 index 29cacdac..00000000 --- a/gcsfs/zonal.py +++ /dev/null @@ -1,107 +0,0 @@ -import asyncio -import logging -import threading -from io import BytesIO - -from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ - AsyncMultiRangeDownloader - -from .core import GCSFile - -logger = logging.getLogger("gcsfs") - - -class GCSZonalFile(GCSFile): - """ - A GCSFile subclass for Zonal Storage buckets. - - This may have custom logic for operations like fetching data ranges. - """ - - def __init__( - self, - gcsfs, - path, - mode="rb", - block_size=None, - autocommit=True, - cache_type="readahead", - cache_options=None, - acl=None, - consistency="md5", - metadata=None, - content_type=None, - timeout=None, - fixed_key_metadata=None, - generation=None, - kms_key_name=None, - client=None, - **kwargs, - ): - self.gcsfs = gcsfs - bucket_name, name, _ = self.gcsfs.split_path(path) - self.mrd = AsyncMultiRangeDownloader(client, bucket_name, name) - self._run_async_in_new_loop(self.mrd.open()) - GCSFile.__init__( - self, - gcsfs, - path, - mode=mode, - block_size=block_size, - autocommit=autocommit, - cache_type=cache_type, - cache_options=cache_options, - acl=acl, - consistency=consistency, - metadata=metadata, - content_type=content_type, - timeout=timeout, - fixed_key_metadata=fixed_key_metadata, - generation=generation, - kms_key_name=kms_key_name, - **kwargs, - ) - - def _fetch_range(self, start=None, end=None): - """ - Get data from a Zonal Storage bucket. - - This method would contain custom logic optimized for zonal reads. - """ - # Placeholder for custom zonal fetch logic. - logger.info("--- GCSZonalFile._fetch_range is called ---") - buffer = BytesIO() - offset = start - length = (end - start) + 1 - self._run_async_in_new_loop(self.mrd.download_ranges([(offset, length, buffer)])) - downloaded_data = buffer.getvalue() - print(downloaded_data) - return downloaded_data - - def _run_async_in_new_loop(self, coro): - """ - Runs a coroutine in a new event loop in a separate thread. - This is done to avoid any deadlocks when calling async code from a - sync method that is itself running inside an outer event loop. - """ - result = None - exception = None - - def runner(): - nonlocal result, exception - try: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - result = asyncio.run_coroutine_threadsafe(coro, loop) - loop.close() - except Exception as e: - exception = e - - thread = threading.Thread(target=runner) - thread.start() - thread.join() - - if exception: - raise exception - return result diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py new file mode 100644 index 00000000..ad1d1122 --- /dev/null +++ b/gcsfs/zonal_file.py @@ -0,0 +1,46 @@ +from .core import GCSFile +from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from io import BytesIO + +class ZonalFile(GCSFile): + """ + GCSFile subclass designed to handle reads from + Zonal buckets using a high-performance gRPC path. + """ + def __init__(self, *args, **kwargs): + """ + Initializes the ZonalFile object. 
+ """ + super().__init__(*args, **kwargs) + self.mrd = asyn.sync(self.gcsfs.loop, self._get_downloader, self.bucket, self.key, self.generation) + + async def _get_downloader(self, bucket_name, object_name, generation=None): + """ + Initializes the AsyncMultiRangeDownloader. + """ + if self.gcsfs.grpc_client is None: + self.gcsfs.grpc_client = AsyncGrpcClient().grpc_client + + downloader = await AsyncMultiRangeDownloader.create_mrd( + self.gcsfs.grpc_client, bucket_name, object_name, generation + ) + return downloader + + async def download_range(self,path, start, end): + """ + Downloads a byte range from the file asynchronously. + """ + offset = start + length = end - start + 1 + buffer = BytesIO() + results = await self.mrd.download_ranges([(offset, length, buffer)]) + return buffer.getvalue() + + def _fetch_range(self, start, end): + """ + Overrides the default _fetch_range to implement the gRPC read path. + + """ + return asyn.sync(self.gcsfs.loop, self.download_range, self.path, start, end) diff --git a/gcsfs/zonal_handler.py b/gcsfs/zonal_handler.py deleted file mode 100644 index a5ea7a63..00000000 --- a/gcsfs/zonal_handler.py +++ /dev/null @@ -1,46 +0,0 @@ -from .core import GCSFile -from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader -from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient -from io import BytesIO - -class ZonalHandler(): - """ - Handler for Zonal bucket operations. - """ - - def __init__(self, gcsfilesystemadapter, **kwargs): - """ - Initializes the ZonalFile object. - """ - self.fs = gcsfilesystemadapter - self.mrd_cache = {} - - async def _get_downloader(self, bucket, object, generation=None): - """ - Initializes the AsyncMultiRangeDownloader. - """ - if self.fs.grpc_client is None: - self.fs.grpc_client = AsyncGrpcClient().grpc_client - - if self.mrd_cache.get((bucket, object, generation)): - return self.mrd_cache[(bucket, object, generation)] - - downloader = await AsyncMultiRangeDownloader.create_mrd( - self.fs.grpc_client, bucket, object, generation - ) - self.mrd_cache[(bucket, object, generation)] = downloader - return downloader - - async def download_range(self,path, start, end): - """ - Downloads a byte range from the file asynchronously. 
- """ - bucket, object, generation = self.fs.split_path(path) - mrd = await self._get_downloader(bucket, object, generation) - offset = start - length = end - start + 1 - buffer = BytesIO() - results = await mrd.download_ranges([(offset, length, buffer)]) - return buffer.getvalue() - \ No newline at end of file diff --git a/gcsfs/zonalfilesystem.py b/gcsfs/zonalfilesystem.py deleted file mode 100644 index c8211ec2..00000000 --- a/gcsfs/zonalfilesystem.py +++ /dev/null @@ -1,114 +0,0 @@ -import logging -from enum import Enum - -from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader - -from .core import GCSFileSystem, GCSFile -from google.cloud.storage._experimental.asyncio.async_grpc_client import (AsyncGrpcClient) - -from .gcs_adapter import GCSAdapter -from .zonalfile import ZonalFile - -logger = logging.getLogger("gcsfs") - -class BucketType(Enum): - ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" - HIERARCHICAL = "HIERARCHICAL" - NON_HIERARCHICAL = "NON_HIERARCHICAL" - UNKNOWN = "UNKNOWN" - -gcs_file_types = { - BucketType.ZONAL_HIERARCHICAL: ZonalFile, - BucketType.UNKNOWN : GCSFile, -} - -class ZonalFileSystem(GCSFileSystem): - """ - An experimental subclass of GCSFileSystem that will contain specialized - logic for Zonal and HNS buckets. - """ - def __init__(self, *args, **kwargs): - # Ensure the experimental flag is not passed down again - kwargs.pop('experimental_zb_hns_support', None) - super().__init__(*args, **kwargs) - self.grpc_client = AsyncGrpcClient().grpc_client - self._storage_layout_cache = {} - print("ZonalFileSystem initialized") - - async def _get_storage_layout(self, bucket): - print("Getting storage layout for bucket {}".format(bucket)) - if bucket in self._storage_layout_cache: - return self._storage_layout_cache[bucket] - try: - response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) - if response.get("locationType") == "zone": - bucket_type = BucketType.ZONAL_HIERARCHICAL - else: - # This should be updated to include HNS in the future - bucket_type = BucketType.NON_HIERARCHICAL - self._storage_layout_cache[bucket] = bucket_type - print("Getting storage layout for bucket {}".format(bucket)) - return bucket_type - except Exception as e: - logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") - # Default to UNKNOWN - self._storage_layout_cache[bucket] = BucketType.UNKNOWN - return BucketType.UNKNOWN - - _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) - - async def _init_async_multi_range_reader(self, client): - """Initializes the reader if the bucket is Zonal [4].""" - - print("Initializing AsyncMultiRangeReader") - async_multi_range_reader = await AsyncMultiRangeDownloader.create_mrd( - client, bucket_name="chandrasiri-rs", object_name="sunidhi" - ) - print("Initialized AsyncMultiRangeReader") - # return async_multi_range_reader - return client - - init_mrd = asyn.sync_wrapper(_init_async_multi_range_reader) - def _open( - self, - path, - mode="rb", - **kwargs, - ): - """ - Open a file. 
- """ - print(f"ZonalFileSystem._open() called for path: {path}") - bucket, _, _ = self.split_path(path) - print(f"Bucket: {bucket}") - try: - bucket_type = self._sync_get_storage_layout(bucket) - if bucket_type == BucketType.ZONAL_HIERARCHICAL: - print("Creating mrd {}".format(bucket)) - # mrd = self.init_mrd(self.grpc_client) - print("Opening ZonalFile") - # return ZonalFile(path=path, bucket=bucket, mrd=mrd, **kwargs) - except Exception as e: - logger.warning( - f"Failed to get storage layout for bucket {bucket}: {e}" - ) - return super()._open(path, mode, **kwargs) - - def _fetch_range(self, start=None, end=None): - """Get data from GCS - start, end : None or integers - if not both None, fetch only given range - """ - print("ZonalFileSystem._fetch_range() called") - - async def _cat_file(self, path, start=None, end=None, **kwargs): - """Simple one-shot get of file data""" - print("ZonalFileSystem._cat_file() called") - client = AsyncGrpcClient().grpc_client - mrd = await AsyncMultiRangeDownloader.create_mrd( - client, bucket_name="chandrasiri-rs", object_name="sunidhi" - ) - return b'This is an output' - - # def _upl \ No newline at end of file From 20b36fc333563ed2bba363abe9288df3f8c9d94a Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 15 Oct 2025 08:35:16 +0000 Subject: [PATCH 12/59] fixes new test in test_core.py --- gcsfs/tests/test_core.py | 89 +++++----------------------------------- 1 file changed, 10 insertions(+), 79 deletions(-) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 15cc488f..0a36f9dd 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -18,6 +18,7 @@ from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials +from gcsfs.gcsfilesystem_adapter import GCSFileSystemAdapter from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files from gcsfs.tests.utils import tempdir, tmpfile @@ -1704,86 +1705,16 @@ def test_get_error(gcs): gcs.get_file(f"{TEST_BUCKET}/doesnotexist", "other") -def test_storage_layout_called_when_toggle_is_on(gcs_factory): - with mock.patch( - "gcsfs.core.GCSFileSystem._sync_get_storage_layout", - return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL, - ) as mock_get_layout: - gcs = gcs_factory(experimental_zb_hns_support=True) +def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory): + gcs = gcs_factory() - # Verify that _get_storage_layout was called - mock_get_layout.assert_called_once() + assert isinstance(gcs, gcsfs.GCSFileSystem), "Expected File system instance to be GCSFileSystem" + assert not isinstance(gcs, + gcsfs.gcsfilesystem_adapter.GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystem" - # Verify that the bucket_type attribute on the gcs instance is set - assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL_HIERARCHICAL +def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): + gcs = gcs_factory(experimental_zb_hns_support=True) + assert isinstance(gcs, + gcsfs.gcsfilesystem_adapter.GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystemAdapter" -def test_storage_layout_not_called_when_toggle_is_default_off(gcs_factory): - with mock.patch( - "gcsfs.core.GCSFileSystem._sync_get_storage_layout" - ) as mock_get_layout: - gcs = gcs_factory() - - # Verify that _get_storage_layout was not called - mock_get_layout.assert_not_called() - - # Verify that the bucket_type attribute on the gcs instance is UNKNOWN 
- assert gcs.bucket_type == gcs.BucketType.UNKNOWN - -def test_storage_layout_zonal(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout: - gcs = gcs_factory(experimental_zb_hns_support=True) - - # Verify that _get_storage_layout was called with the BUCKET name - mock_get_layout.assert_called_once() - - # Verify that the bucket_type attribute on the gcs instance is set to ZONAL - assert gcs.bucket_type == gcs.BucketType.ZONAL_HIERARCHICAL - - -def test_default_bucket_type(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: - gcs = gcs_factory() - - # Verify that _get_storage_layout was called with the BUCKET name - mock_get_layout.assert_not_called() - - # Verify that the bucket_type attribute on the gcs instance is set to ZONAL - assert gcs.bucket_type == gcs.BucketType.UNKNOWN - -def test_non_hierarchical_bucket_type(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.NON_HIERARCHICAL) as mock_get_layout: - gcs = gcs_factory(experimental_zb_hns_support=True) - - # Verify that _get_storage_layout was called with the BUCKET name - mock_get_layout.assert_called_once() - - # Verify that the bucket_type attribute on the gcs instance is set to ZONAL - assert gcs.bucket_type == gcs.BucketType.NON_HIERARCHICAL - -def test_gcs_adapter_called_for_zonal_bucket(gcs_factory): - with mock.patch("gcsfs.core.GCSFileSystem._sync_get_storage_layout", return_value=GCSFileSystem.BucketType.ZONAL_HIERARCHICAL) as mock_get_layout, \ - mock.patch("gcsfs.core.GCSAdapter.handle", return_value=(200, {}, {}, b"some data")) as mock_handle: - - gcs = gcs_factory(experimental_zb_hns_support=True) - - # This call should be routed to the gcs adapter and this test should be - # updated with real data assertions instead of mock_handle once read functionality is added in adapter - gcs.cat_file(f"{TEST_BUCKET}/some/file") - - assert gcs.bucket_type == GCSFileSystem.BucketType.ZONAL_HIERARCHICAL - mock_handle.assert_called_once() - - -def test_gcs_adapter_not_called_when_bucket_type_is_not_set(gcs): - with mock.patch("gcsfs.core.GCSAdapter.handle", - return_value=(200, {}, {}, b"some data")) as mock_handle: - - assert gcs.bucket_type == gcs.BucketType.UNKNOWN - # Use an existing file from the test setup - test_file = f"{TEST_BUCKET}/test/accounts.1.json" - - # This call should NOT be routed to the adapter - data = gcs.cat(test_file) - assert data == files["test/accounts.1.json"] - mock_handle.assert_not_called() From c9f569a18c3781e47b31f3c888a7dce1d724fe16 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 15 Oct 2025 08:41:14 +0000 Subject: [PATCH 13/59] refactors/renames GCS Filesystem Adapter --- gcsfs/core.py | 4 ++-- gcsfs/{gcsfilesystem_adapter.py => gcs_hns_filesystem.py} | 2 +- gcsfs/tests/test_core.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) rename gcsfs/{gcsfilesystem_adapter.py => gcs_hns_filesystem.py} (98%) diff --git a/gcsfs/core.py b/gcsfs/core.py index 0b03e2a5..fca16e00 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -290,8 +290,8 @@ def __new__(cls, *args, **kwargs): experimental_support = kwargs.pop('experimental_zb_hns_support', False) if experimental_support: - from .gcsfilesystem_adapter import GCSFileSystemAdapter - return object.__new__(GCSFileSystemAdapter) + from .gcs_hns_filesystem 
import GCSHNSFileSystem + return object.__new__(GCSHNSFileSystem) else: return object.__new__(cls) diff --git a/gcsfs/gcsfilesystem_adapter.py b/gcsfs/gcs_hns_filesystem.py similarity index 98% rename from gcsfs/gcsfilesystem_adapter.py rename to gcsfs/gcs_hns_filesystem.py index cbd0a361..7d52827f 100644 --- a/gcsfs/gcsfilesystem_adapter.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -22,7 +22,7 @@ class BucketType(Enum): None : GCSFile, } -class GCSFileSystemAdapter(GCSFileSystem): +class GCSHNSFileSystem(GCSFileSystem): """ A subclass of GCSFileSystem that will contain specialized logic for Zonal and HNS buckets. diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 0a36f9dd..2df677af 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -18,7 +18,7 @@ from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials -from gcsfs.gcsfilesystem_adapter import GCSFileSystemAdapter +from gcsfs.gcs_hns_filesystem import GCSHNSFileSystem from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files from gcsfs.tests.utils import tempdir, tmpfile @@ -1710,11 +1710,11 @@ def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory assert isinstance(gcs, gcsfs.GCSFileSystem), "Expected File system instance to be GCSFileSystem" assert not isinstance(gcs, - gcsfs.gcsfilesystem_adapter.GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystem" + gcsfs.gcs_hns_filesystem.GCSHNSFileSystem), "Expected File system instance to be GCSFileSystem" def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): gcs = gcs_factory(experimental_zb_hns_support=True) assert isinstance(gcs, - gcsfs.gcsfilesystem_adapter.GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystemAdapter" + gcsfs.gcs_hns_filesystem.GCSHNSFileSystem), "Expected File system instance to be GCSFileSystemAdapter" From e5fc9351f0f610b9bfcc9b6154e3dc84f23dcb78 Mon Sep 17 00:00:00 2001 From: suni72 Date: Wed, 15 Oct 2025 20:32:09 +0000 Subject: [PATCH 14/59] Override _cat_file to handle other read methods of gcsfs --- gcsfs/core.py | 2 +- gcsfs/gcs_hns_filesystem.py | 34 ++++++++++++++++++++++++++----- gcsfs/zonal_file.py | 40 +++++++++++++++++++++++-------------- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index fca16e00..5fb9d65e 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -284,7 +284,7 @@ class GCSFileSystem(asyn.AsyncFileSystem): def __new__(cls, *args, **kwargs): """ - Factory to return a ZonalFileSystem instance if the experimental + Factory to return a GCSHNSFileSystem instance if the experimental flag is enabled. 
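Example (illustrative): GCSFileSystem(experimental_zb_hns_support=True) constructs a GCSHNSFileSystem, while a plain GCSFileSystem() call constructs this class as usual.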
""" experimental_support = kwargs.pop('experimental_zb_hns_support', False) diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcs_hns_filesystem.py index 7d52827f..0fc00884 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -2,31 +2,34 @@ from enum import Enum from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient from .core import GCSFileSystem, GCSFile from .zonal_file import ZonalFile -from io import BytesIO -import asyncio logger = logging.getLogger("gcsfs") + class BucketType(Enum): ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" HIERARCHICAL = "HIERARCHICAL" NON_HIERARCHICAL = "NON_HIERARCHICAL" UNKNOWN = "UNKNOWN" + gcs_file_types = { BucketType.ZONAL_HIERARCHICAL: ZonalFile, - BucketType.UNKNOWN : GCSFile, - None : GCSFile, + BucketType.UNKNOWN: GCSFile, + None: GCSFile, } + class GCSHNSFileSystem(GCSFileSystem): """ An subclass of GCSFileSystem that will contain specialized logic for Zonal and HNS buckets. """ + def __init__(self, *args, **kwargs): kwargs.pop('experimental_zb_hns_support', None) super().__init__(*args, **kwargs) @@ -53,7 +56,6 @@ async def _get_storage_layout(self, bucket): _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) - def _open( self, path, @@ -66,3 +68,25 @@ def _open( bucket, _, _ = self.split_path(path) bucket_type = self._sync_get_storage_layout(bucket) return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) + + def _process_limits(self, start, end): + # Dummy method to process start and end + if start is None: + start = 0 + if end is None: + end = 100 + return start, end - start + 1 + + async def _cat_file(self, path, start=None, end=None, **kwargs): + """ + Fetch a file's contents as bytes. + """ + mrd = kwargs.pop("mrd", None) + if mrd is None: + if self.grpc_client is None: + self.grpc_client = AsyncGrpcClient().grpc_client + bucket, object_name, generation = self.split_path(path) + mrd = await ZonalFile._create_mrd(self.grpc_client, bucket, object_name, generation) + + offset, length = self._process_limits(start, end) + return await ZonalFile.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index ad1d1122..b4d5297f 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -1,46 +1,56 @@ -from .core import GCSFile +from io import BytesIO + from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient -from io import BytesIO +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader + +from .core import GCSFile + class ZonalFile(GCSFile): """ GCSFile subclass designed to handle reads from Zonal buckets using a high-performance gRPC path. """ + def __init__(self, *args, **kwargs): """ Initializes the ZonalFile object. """ super().__init__(*args, **kwargs) - self.mrd = asyn.sync(self.gcsfs.loop, self._get_downloader, self.bucket, self.key, self.generation) + self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) + + @classmethod + async def _create_mrd(cls, grpc_client, bucket_name, object_name, generation=None): + """ + Creates the AsyncMultiRangeDownloader. 
+ """ + mrd = await AsyncMultiRangeDownloader.create_mrd( + grpc_client, bucket_name, object_name, generation + ) + return mrd - async def _get_downloader(self, bucket_name, object_name, generation=None): + async def _init_mrd(self, bucket_name, object_name, generation=None): """ Initializes the AsyncMultiRangeDownloader. """ if self.gcsfs.grpc_client is None: self.gcsfs.grpc_client = AsyncGrpcClient().grpc_client - downloader = await AsyncMultiRangeDownloader.create_mrd( - self.gcsfs.grpc_client, bucket_name, object_name, generation - ) - return downloader + return await self._create_mrd(self.gcsfs.grpc_client, bucket_name, object_name, generation) - async def download_range(self,path, start, end): + @classmethod + async def download_range(cls, offset, length, mrd): """ Downloads a byte range from the file asynchronously. """ - offset = start - length = end - start + 1 buffer = BytesIO() - results = await self.mrd.download_ranges([(offset, length, buffer)]) + await mrd.download_ranges([(offset, length, buffer)]) return buffer.getvalue() def _fetch_range(self, start, end): """ Overrides the default _fetch_range to implement the gRPC read path. - """ - return asyn.sync(self.gcsfs.loop, self.download_range, self.path, start, end) + """ + return self.gcsfs.cat_file(self.path, start=start, end=end, mrd=self.mrd) From 50d30b3763ba3bd4c93de336ff164894b8459a5e Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Thu, 16 Oct 2025 09:05:02 +0000 Subject: [PATCH 15/59] creates grpc client inside GCSHNSFilesystem --- gcsfs/gcs_hns_filesystem.py | 11 +++++++++-- gcsfs/zonal_file.py | 3 --- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcs_hns_filesystem.py index 0fc00884..ab4bc19f 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -27,15 +27,22 @@ class BucketType(Enum): class GCSHNSFileSystem(GCSFileSystem): """ An subclass of GCSFileSystem that will contain specialized - logic for Zonal and HNS buckets. + logic for HNS Filesystem. """ def __init__(self, *args, **kwargs): kwargs.pop('experimental_zb_hns_support', None) super().__init__(*args, **kwargs) - self.grpc_client = None + self.grpc_client=None + self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) self._storage_layout_cache = {} + async def _create_grpc_client(self): + if self.grpc_client is None: + return AsyncGrpcClient().grpc_client + else: + return self.grpc_client + async def _get_storage_layout(self, bucket): if bucket in self._storage_layout_cache: return self._storage_layout_cache[bucket] diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index b4d5297f..d8ead768 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -34,9 +34,6 @@ async def _init_mrd(self, bucket_name, object_name, generation=None): """ Initializes the AsyncMultiRangeDownloader. 
""" - if self.gcsfs.grpc_client is None: - self.gcsfs.grpc_client = AsyncGrpcClient().grpc_client - return await self._create_mrd(self.gcsfs.grpc_client, bucket_name, object_name, generation) @classmethod From cbf00b3d351b4da9cc934663565fc64f04dc2bf0 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Thu, 16 Oct 2025 11:11:02 +0000 Subject: [PATCH 16/59] refactors to reuse grpc client in cat_ranges --- gcsfs/gcs_hns_filesystem.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcs_hns_filesystem.py index ab4bc19f..6f7b33f1 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -90,8 +90,6 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): """ mrd = kwargs.pop("mrd", None) if mrd is None: - if self.grpc_client is None: - self.grpc_client = AsyncGrpcClient().grpc_client bucket, object_name, generation = self.split_path(path) mrd = await ZonalFile._create_mrd(self.grpc_client, bucket, object_name, generation) From a202579408d4623b5af0845b8df9b9057bfbd930 Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Fri, 17 Oct 2025 16:02:34 +0000 Subject: [PATCH 17/59] Move classmethods in ZonalFile to a util file --- gcsfs/gcs_hns_filesystem.py | 10 ++++++---- gcsfs/zb_hns_utils.py | 22 ++++++++++++++++++++++ gcsfs/zonal_file.py | 26 ++------------------------ 3 files changed, 30 insertions(+), 28 deletions(-) create mode 100644 gcsfs/zb_hns_utils.py diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcs_hns_filesystem.py index 6f7b33f1..cc839348 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -4,6 +4,7 @@ from fsspec import asyn from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from . import zb_hns_utils from .core import GCSFileSystem, GCSFile from .zonal_file import ZonalFile @@ -19,8 +20,9 @@ class BucketType(Enum): gcs_file_types = { BucketType.ZONAL_HIERARCHICAL: ZonalFile, + BucketType.NON_HIERARCHICAL: GCSFile, + BucketType.HIERARCHICAL: GCSFile, BucketType.UNKNOWN: GCSFile, - None: GCSFile, } @@ -33,7 +35,7 @@ class GCSHNSFileSystem(GCSFileSystem): def __init__(self, *args, **kwargs): kwargs.pop('experimental_zb_hns_support', None) super().__init__(*args, **kwargs) - self.grpc_client=None + self.grpc_client = None self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) self._storage_layout_cache = {} @@ -91,7 +93,7 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): mrd = kwargs.pop("mrd", None) if mrd is None: bucket, object_name, generation = self.split_path(path) - mrd = await ZonalFile._create_mrd(self.grpc_client, bucket, object_name, generation) + mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) offset, length = self._process_limits(start, end) - return await ZonalFile.download_range(offset=offset, length=length, mrd=mrd) + return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py new file mode 100644 index 00000000..0ad52a20 --- /dev/null +++ b/gcsfs/zb_hns_utils.py @@ -0,0 +1,22 @@ +from io import BytesIO + +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader + + +async def create_mrd(grpc_client, bucket_name, object_name, generation=None): + """ + Creates the AsyncMultiRangeDownloader. 
+ """ + mrd = await AsyncMultiRangeDownloader.create_mrd( + grpc_client, bucket_name, object_name, generation + ) + return mrd + + +async def download_range(offset, length, mrd): + """ + Downloads a byte range from the file asynchronously. + """ + buffer = BytesIO() + await mrd.download_ranges([(offset, length, buffer)]) + return buffer.getvalue() diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index d8ead768..a9eaad95 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -1,9 +1,6 @@ -from io import BytesIO - from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader +from . import zb_hns_utils from .core import GCSFile @@ -20,30 +17,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) - @classmethod - async def _create_mrd(cls, grpc_client, bucket_name, object_name, generation=None): - """ - Creates the AsyncMultiRangeDownloader. - """ - mrd = await AsyncMultiRangeDownloader.create_mrd( - grpc_client, bucket_name, object_name, generation - ) - return mrd - async def _init_mrd(self, bucket_name, object_name, generation=None): """ Initializes the AsyncMultiRangeDownloader. """ - return await self._create_mrd(self.gcsfs.grpc_client, bucket_name, object_name, generation) - - @classmethod - async def download_range(cls, offset, length, mrd): - """ - Downloads a byte range from the file asynchronously. - """ - buffer = BytesIO() - await mrd.download_ranges([(offset, length, buffer)]) - return buffer.getvalue() + return await zb_hns_utils.create_mrd(self.gcsfs.grpc_client, bucket_name, object_name, generation) def _fetch_range(self, start, end): """ From 5064a97bf72fb4929a8120ea5ab33b8e6421cd4f Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Sun, 19 Oct 2025 16:51:57 +0000 Subject: [PATCH 18/59] Implement logic to process limits as offset and length --- gcsfs/gcs_hns_filesystem.py | 60 +++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcs_hns_filesystem.py index cc839348..94a38e14 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcs_hns_filesystem.py @@ -78,13 +78,57 @@ def _open( bucket_type = self._sync_get_storage_layout(bucket) return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) - def _process_limits(self, start, end): - # Dummy method to process start and end + async def process_limits_to_offset_and_length(self, path, start, end): + """ + Calculates the read offset and length from start and end parameters. + + Args: + path (str): The path to the file. + start (int | None): The starting byte position. + end (int | None): The ending byte position. + + Returns: + tuple: A tuple containing (offset, length). + + Raises: + ValueError: If the calculated range is invalid. 
+ """ + size = None + if start is None: - start = 0 + offset = 0 + elif start < 0: + size = size or (await self._info(path))["size"] + offset = size + start + else: + offset = start + if end is None: - end = 100 - return start, end - start + 1 + size = size or (await self._info(path))["size"] + effective_end = size + elif end < 0: + size = size or (await self._info(path))["size"] + effective_end = size + end + else: + effective_end = end + + size = size or (await self._info(path))["size"] + if offset < 0: + raise ValueError(f"Calculated start offset ({offset}) cannot be negative.") + if effective_end < offset: + raise ValueError(f"Calculated end position ({effective_end}) cannot be before start offset ({offset}).") + elif effective_end == offset: + length = 0 # Handle zero-length slice + elif effective_end > size: + length = size - offset # Clamp length to file size + else: + length = effective_end - offset # Normal case + + if offset + length > size: + # This might happen with large positive end values + length = max(0, size - offset) + + return offset, length async def _cat_file(self, path, start=None, end=None, **kwargs): """ @@ -95,5 +139,9 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): bucket, object_name, generation = self.split_path(path) mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) - offset, length = self._process_limits(start, end) + offset, length = await self.process_limits_to_offset_and_length(path, start, end) + # If length = 0, mrd returns till end of file, so handle that case here + if length == 0: + return b"" + return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) From ac276f60a45a0635f367ab037e7403e0735c4d09 Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Sun, 19 Oct 2025 18:17:06 +0000 Subject: [PATCH 19/59] Add fallback logic for non zonal buckets Rename gcshnsfilesystem to GCSFileSystemAdapter --- gcsfs/core.py | 4 ++-- gcsfs/{gcs_hns_filesystem.py => gcsfs_adapter.py} | 12 ++++++++++-- gcsfs/tests/test_core.py | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) rename gcsfs/{gcs_hns_filesystem.py => gcsfs_adapter.py} (91%) diff --git a/gcsfs/core.py b/gcsfs/core.py index 5fb9d65e..e43b30fc 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -290,8 +290,8 @@ def __new__(cls, *args, **kwargs): experimental_support = kwargs.pop('experimental_zb_hns_support', False) if experimental_support: - from .gcs_hns_filesystem import GCSHNSFileSystem - return object.__new__(GCSHNSFileSystem) + from .gcsfs_adapter import GCSFileSystemAdapter + return object.__new__(GCSFileSystemAdapter) else: return object.__new__(cls) diff --git a/gcsfs/gcs_hns_filesystem.py b/gcsfs/gcsfs_adapter.py similarity index 91% rename from gcsfs/gcs_hns_filesystem.py rename to gcsfs/gcsfs_adapter.py index 94a38e14..216e99ae 100644 --- a/gcsfs/gcs_hns_filesystem.py +++ b/gcsfs/gcsfs_adapter.py @@ -26,10 +26,10 @@ class BucketType(Enum): } -class GCSHNSFileSystem(GCSFileSystem): +class GCSFileSystemAdapter(GCSFileSystem): """ An subclass of GCSFileSystem that will contain specialized - logic for HNS Filesystem. + logic for ZB and HNS buckets. 
""" def __init__(self, *args, **kwargs): @@ -130,6 +130,10 @@ async def process_limits_to_offset_and_length(self, path, start, end): return offset, length + async def is_zonal_bucket(self, bucket): + layout = await self._get_storage_layout(bucket) + return layout == BucketType.ZONAL_HIERARCHICAL + async def _cat_file(self, path, start=None, end=None, **kwargs): """ Fetch a file's contents as bytes. @@ -137,6 +141,10 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): mrd = kwargs.pop("mrd", None) if mrd is None: bucket, object_name, generation = self.split_path(path) + # Fall back to default implementation if not a zonal bucket + if not await self.is_zonal_bucket(bucket): + return await super()._cat_file(path, start=start, end=end, **kwargs) + mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) offset, length = await self.process_limits_to_offset_and_length(path, start, end) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 2df677af..442178cc 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -18,7 +18,7 @@ from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials -from gcsfs.gcs_hns_filesystem import GCSHNSFileSystem +from gcsfs.gcsfs_adapter import GCSFileSystemAdapter from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files from gcsfs.tests.utils import tempdir, tmpfile @@ -1710,11 +1710,11 @@ def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory assert isinstance(gcs, gcsfs.GCSFileSystem), "Expected File system instance to be GCSFileSystem" assert not isinstance(gcs, - gcsfs.gcs_hns_filesystem.GCSHNSFileSystem), "Expected File system instance to be GCSFileSystem" + GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystem" def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): gcs = gcs_factory(experimental_zb_hns_support=True) assert isinstance(gcs, - gcsfs.gcs_hns_filesystem.GCSHNSFileSystem), "Expected File system instance to be GCSFileSystemAdapter" + GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystemAdapter" From dbecb1281f57dbf38686d0e7c865351d3855668d Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 20 Oct 2025 20:05:18 +0000 Subject: [PATCH 20/59] Add unit tests for zb_hns_utils --- gcsfs/tests/test_zb_hns_utils.py | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 gcsfs/tests/test_zb_hns_utils.py diff --git a/gcsfs/tests/test_zb_hns_utils.py b/gcsfs/tests/test_zb_hns_utils.py new file mode 100644 index 00000000..23682b85 --- /dev/null +++ b/gcsfs/tests/test_zb_hns_utils.py @@ -0,0 +1,57 @@ +from io import BytesIO +from unittest import mock + +import pytest + +from gcsfs import zb_hns_utils + + +@pytest.mark.asyncio +async def test_create_mrd(): + """ + Tests that create_mrd calls the underlying AsyncMultiRangeDownloader.create_mrd + with the correct arguments and returns its result. 
+ """ + mock_grpc_client = mock.Mock() + bucket_name = "test-bucket" + object_name = "test-object" + generation = "12345" + mock_mrd_instance = mock.AsyncMock() + + with mock.patch( + "gcsfs.zb_hns_utils.AsyncMultiRangeDownloader.create_mrd", + new_callable=mock.AsyncMock, + return_value=mock_mrd_instance, + ) as mock_create: + result = await zb_hns_utils.create_mrd( + mock_grpc_client, bucket_name, object_name, generation + ) + + mock_create.assert_called_once_with( + mock_grpc_client, bucket_name, object_name, generation + ) + assert result is mock_mrd_instance + + +@pytest.mark.asyncio +async def test_download_range(): + """ + Tests that download_range calls mrd.download_ranges with the correct + parameters and returns the data written to the buffer. + """ + offset = 10 + length = 20 + mock_mrd = mock.AsyncMock() + expected_data = b"test data from download" + + # Simulate the download_ranges method writing data to the buffer + async def mock_download_ranges(ranges): + _offset, _length, buffer = ranges[0] + buffer.write(expected_data) + + mock_mrd.download_ranges.side_effect = mock_download_ranges + + result = await zb_hns_utils.download_range(offset, length, mock_mrd) + + mock_mrd.download_ranges.assert_called_once_with([(offset, length, mock.ANY)]) + assert result == expected_data \ No newline at end of file From 37a495b9f8bf9c078a96f5ab720e495860335f1e Mon Sep 17 00:00:00 2001 From: suni72 Date: Tue, 21 Oct 2025 18:59:10 +0000 Subject: [PATCH 21/59] Add test for GCSFSAdapter with read_block --- gcsfs/tests/conftest.py | 22 +++++++++++ gcsfs/tests/test_gcsfs_adapter.py | 63 +++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 86 insertions(+) create mode 100644 gcsfs/tests/test_gcsfs_adapter.py diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 57f605b5..00025956 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -123,6 +123,28 @@ def gcs(gcs_factory, populate=True): except: # noqa: E722 pass +@pytest.fixture +def gcs_adapter(gcs_factory, populate=True): + gcs_adapter = gcs_factory(experimental_zb_hns_support=True) + try: + try: + gcs_adapter.rm(TEST_BUCKET, recursive=True) + except FileNotFoundError: + pass + try: + gcs_adapter.mkdir(TEST_BUCKET) + except Exception: + pass + if populate: + gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) + gcs_adapter.invalidate_cache() + yield gcs_adapter + finally: + try: + gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) + gcs_adapter.rm(TEST_BUCKET) + except Exception: + pass @pytest.fixture def gcs_versioned(gcs_factory): diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py new file mode 100644 index 00000000..20a36568 --- /dev/null +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -0,0 +1,63 @@ +import io +from unittest import mock + +import pytest +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader + +from gcsfs.gcsfs_adapter import BucketType +from gcsfs.tests.conftest import files +from gcsfs.tests.settings import TEST_BUCKET + +file = "test/accounts.1.json" +file_path = f"{TEST_BUCKET}/{file}" +data = files[file] +lines = io.BytesIO(data).readlines() +file_size = len(data) + +read_block_params = [ + # Read specific chunk + pytest.param(3, 10, None, data[3:3 + 10], id="offset=3, length=10"), + # Read from beginning up to length + pytest.param(0, 5, None, data[0:5], id="offset=0, length=5"), + # Read from offset to end (simulate large length) + pytest.param(15, 5000, 
None, data[15:], id="offset=15, length=large"), + # Read beyond end of file (should return empty bytes) + pytest.param(file_size + 10, 5, None, b"", id="offset>size, length=5"), + # Read exactly at the end (zero length effective) + pytest.param(file_size, 10, None, b"", id="offset=size, length=10"), + # Read with delimiter + pytest.param(1, 35, b'\n', lines[1], id="offset=1, length=35, delimiter=newline"), + pytest.param(0, 30, b'\n', lines[0], id="offset=0, length=30, delimiter=newline"), + pytest.param(0, 35, b'\n', lines[0] + lines[1], id="offset=0, length=35, delimiter=newline"), +] + + +@pytest.mark.parametrize("offset, length, delimiter, expected_data", read_block_params) +def test_read_block_zonal_path(gcs_adapter, offset, length, delimiter, expected_data): + path = file_path + + async def mock_download_ranges_side_effect(read_requests, **kwargs): + if read_requests and len(read_requests) == 1: + param_offset, param_length, buffer_arg = read_requests[0] + if hasattr(buffer_arg, 'write'): + buffer_arg.write(data[param_offset:param_offset + param_length]) + return [mock.Mock(error=None)] + + patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" + mock_sync_get_layout = mock.Mock(return_value=BucketType.ZONAL_HIERARCHICAL) + + patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" + mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) + mock_downloader.download_ranges = mock.AsyncMock(side_effect=mock_download_ranges_side_effect) + mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) + + with mock.patch(patch_target_sync_layout, mock_sync_get_layout), \ + mock.patch(patch_target_create_mrd, mock_create_mrd): + result = gcs_adapter.read_block(path, offset, length, delimiter) + + assert result == expected_data + mock_sync_get_layout.assert_called_once_with(TEST_BUCKET) + if expected_data: + mock_downloader.download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) + else: + mock_downloader.download_ranges.assert_not_called() diff --git a/requirements.txt b/requirements.txt index d8a13c68..448e8b3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ google-auth>=1.2 google-auth-oauthlib google-cloud-storage requests +pytest \ No newline at end of file From cba7d274245fcb458b2669bab37afdfd41e37835 Mon Sep 17 00:00:00 2001 From: suni72 Date: Fri, 24 Oct 2025 11:35:15 +0000 Subject: [PATCH 22/59] Update _get_storage_layout to use a single return statement --- gcsfs/gcsfs_adapter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 216e99ae..0cf6d72d 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -51,17 +51,15 @@ async def _get_storage_layout(self, bucket): try: response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) if response.get("locationType") == "zone": - bucket_type = BucketType.ZONAL_HIERARCHICAL + self._storage_layout_cache[bucket] = BucketType.ZONAL_HIERARCHICAL else: # This should be updated to include HNS in the future - bucket_type = BucketType.NON_HIERARCHICAL - self._storage_layout_cache[bucket] = bucket_type - return bucket_type + self._storage_layout_cache[bucket] = BucketType.NON_HIERARCHICAL except Exception as e: logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") # Default to UNKNOWN self._storage_layout_cache[bucket] = BucketType.UNKNOWN - return BucketType.UNKNOWN + return self._storage_layout_cache[bucket] 
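# asyn.sync_wrapper (below) exposes this async lookup to synchronous callers such as _open; repeat calls, including the UNKNOWN failure case, are answered from _storage_layout_cache.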
_sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) From 4ff9fe4adfba417e8348bfe735d3ef29717fe9e3 Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 05:30:20 +0000 Subject: [PATCH 23/59] Updated gcs_adapter.open to pass on correct default values to GCSFile --- gcsfs/gcsfs_adapter.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 0cf6d72d..05a36bc1 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -67,6 +67,14 @@ def _open( self, path, mode="rb", + block_size=None, + cache_options=None, + acl=None, + consistency=None, + metadata=None, + autocommit=True, + fixed_key_metadata=None, + generation=None, **kwargs, ): """ @@ -74,7 +82,16 @@ def _open( """ bucket, _, _ = self.split_path(path) bucket_type = self._sync_get_storage_layout(bucket) - return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) + return gcs_file_types[bucket_type](self, path, + mode, + block_size, + cache_options=cache_options, + consistency=consistency, + metadata=metadata, + acl=acl, + autocommit=autocommit, + fixed_key_metadata=fixed_key_metadata, + **kwargs,) From 389a4b017ed6e5727a973a390040f3f8363604c9 Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 05:33:45 +0000 Subject: [PATCH 24/59] Move logic for handling 0 length in MRD to zb_hns_utils --- gcsfs/gcsfs_adapter.py | 4 ---- gcsfs/zb_hns_utils.py | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 05a36bc1..a296fc30 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -163,8 +163,4 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) offset, length = await self.process_limits_to_offset_and_length(path, start, end) - # If length = 0, mrd returns till end of file, so handle that case here - if length == 0: - return b"" - return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py index 0ad52a20..2fb8dfbe 100644 --- a/gcsfs/zb_hns_utils.py +++ b/gcsfs/zb_hns_utils.py @@ -17,6 +17,9 @@ async def download_range(offset, length, mrd): """ Downloads a byte range from the file asynchronously. """ + # If length = 0, mrd returns till end of file, so handle that case here + if length == 0: + return b"" buffer = BytesIO() await mrd.download_ranges([(offset, length, buffer)]) return buffer.getvalue() From 133b4fac58bf6beda74c65acba7f6403e3501359 Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Mon, 27 Oct 2025 07:29:44 +0000 Subject: [PATCH 25/59] Add comments for clarity --- gcsfs/gcsfs_adapter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 0cf6d72d..b7dd738b 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -76,6 +76,7 @@ def _open( bucket_type = self._sync_get_storage_layout(bucket) return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) + # Replacement method for _process_limits to support new params (offset and length) for MRD. async def process_limits_to_offset_and_length(self, path, start, end): """ Calculates the read offset and length from start and end parameters. 
@@ -137,6 +138,9 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): Fetch a file's contents as bytes. """ mrd = kwargs.pop("mrd", None) + + # A new MRD is required when read is done directly by the + # GCSFileSystem class without creating a GCSFile object first. if mrd is None: bucket, object_name, generation = self.split_path(path) # Fall back to default implementation if not a zonal bucket From e36b7edb9d55435b52b5be3f722b09064ceb3e01 Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 14:44:35 +0000 Subject: [PATCH 26/59] Updated zonal file to only create mrd for read mode. --- gcsfs/zonal_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index a9eaad95..9059fd7c 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -15,7 +15,9 @@ def __init__(self, *args, **kwargs): Initializes the ZonalFile object. """ super().__init__(*args, **kwargs) - self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) + self.mrd = None + if "r" in self.mode: + self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) From b648df479c7e18e61e8674f29982d6b670b8434d Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 15:01:02 +0000 Subject: [PATCH 27/59] Updated gcs_adapter fixture to not set up the bucket when the real GCS endpoint is being used. Updated zonal tests to work with both real and fake GCS. Added more tests for zonal path reads. --- gcsfs/tests/conftest.py | 30 +++-- gcsfs/tests/test_gcsfs_adapter.py | 179 +++++++++++++++++++++++++----- 2 files changed, 167 insertions(+), 42 deletions(-) diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 00025956..d94ceaa9 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -126,23 +126,29 @@ def gcs_adapter(gcs_factory, populate=True): gcs_adapter = gcs_factory(experimental_zb_hns_support=True) + # Check if we are running against a real GCS endpoint + is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" try: - try: - gcs_adapter.rm(TEST_BUCKET, recursive=True) - except FileNotFoundError: - pass - try: - gcs_adapter.mkdir(TEST_BUCKET) - except Exception: - pass - if populate: - gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) + # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint + if not is_real_gcs: + try: + gcs_adapter.rm(TEST_BUCKET, recursive=True) + except FileNotFoundError: + pass + try: + gcs_adapter.mkdir(TEST_BUCKET) + except Exception: + pass + if populate: + gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) gcs_adapter.invalidate_cache() yield gcs_adapter finally: try: - gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) - gcs_adapter.rm(TEST_BUCKET) + # Only remove the bucket/contents if we are NOT using the real GCS + if not is_real_gcs: + gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) + gcs_adapter.rm(TEST_BUCKET) except Exception: pass diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 20a36568..a5148785 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -1,29 +1,75 @@ +import contextlib import io +import os +from itertools import chain from unittest import mock import pytest from 
google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader from gcsfs.gcsfs_adapter import BucketType -from gcsfs.tests.conftest import files +from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files from gcsfs.tests.settings import TEST_BUCKET file = "test/accounts.1.json" file_path = f"{TEST_BUCKET}/{file}" -data = files[file] -lines = io.BytesIO(data).readlines() -file_size = len(data) +json_data = files[file] +lines = io.BytesIO(json_data).readlines() +file_size = len(json_data) + + +@pytest.fixture +def zonal_mocks(): + """A factory fixture for mocking Zonal bucket functionality.""" + + @contextlib.contextmanager + def _zonal_mocks_factory(file_data): + """Creates mocks for a given file content.""" + is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + if is_real_gcs: + yield None + return + patch_target_get_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_storage_layout" + patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" + patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" + patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" + + async def download_side_effect(read_requests, **kwargs): + if read_requests and len(read_requests) == 1: + param_offset, param_length, buffer_arg = read_requests[0] + if hasattr(buffer_arg, 'write'): + buffer_arg.write(file_data[param_offset:param_offset + param_length]) + return [mock.Mock(error=None)] + + mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) + mock_downloader.download_ranges = mock.AsyncMock(side_effect=download_side_effect) + + mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) + with mock.patch(patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL) as mock_sync_layout, \ + mock.patch(patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL), \ + mock.patch(patch_target_create_mrd, mock_create_mrd), \ + mock.patch(patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock) as mock_cat_file: + mocks = {"sync_layout": mock_sync_layout, + "create_mrd": mock_create_mrd, + "downloader": mock_downloader, + "cat_file": mock_cat_file} + yield mocks + # Common assertion for all tests using this mock + mock_cat_file.assert_not_called() + + yield _zonal_mocks_factory read_block_params = [ # Read specific chunk - pytest.param(3, 10, None, data[3:3 + 10], id="offset=3, length=10"), + pytest.param(3, 10, None, json_data[3:3 + 10], id="offset=3, length=10"), # Read from beginning up to length - pytest.param(0, 5, None, data[0:5], id="offset=0, length=5"), + pytest.param(0, 5, None, json_data[0:5], id="offset=0, length=5"), # Read from offset to end (simulate large length) - pytest.param(15, 5000, None, data[15:], id="offset=15, length=large"), + pytest.param(15, 5000, None, json_data[15:], id="offset=15, length=large"), # Read beyond end of file (should return empty bytes) pytest.param(file_size + 10, 5, None, b"", id="offset>size, length=5"), - # Read exactly at the end (zero length effective) + # Read exactly at the end (zero length) pytest.param(file_size, 10, None, b"", id="offset=size, length=10"), # Read with delimiter pytest.param(1, 35, b'\n', lines[1], id="offset=1, length=35, delimiter=newline"), pytest.param(0, 30, b'\n', lines[0], id="offset=0, length=30, delimiter=newline"), pytest.param(0, 35, b'\n', lines[0] + lines[1], id="offset=0, length=35, delimiter=newline"), ] @pytest.mark.parametrize("offset, length, delimiter, expected_data", read_block_params) -def test_read_block_zonal_path(gcs_adapter, offset, length, delimiter, expected_data): +def 
test_read_block_zb(gcs_adapter, zonal_mocks, offset, length, delimiter, expected_data): path = file_path - async def mock_download_ranges_side_effect(read_requests, **kwargs): - if read_requests and len(read_requests) == 1: - param_offset, param_length, buffer_arg = read_requests[0] - if hasattr(buffer_arg, 'write'): - buffer_arg.write(data[param_offset:param_offset + param_length]) - return [mock.Mock(error=None)] + with zonal_mocks(json_data) as mocks: + result = gcs_adapter.read_block(path, offset, length, delimiter) - patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" - mock_sync_get_layout = mock.Mock(return_value=BucketType.ZONAL_HIERARCHICAL) + assert result == expected_data + if mocks: + mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + if expected_data: + mocks["downloader"].download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) + else: + mocks["downloader"].download_ranges.assert_not_called() - patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" - mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) - mock_downloader.download_ranges = mock.AsyncMock(side_effect=mock_download_ranges_side_effect) - mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) - with mock.patch(patch_target_sync_layout, mock_sync_get_layout), \ - mock.patch(patch_target_create_mrd, mock_create_mrd): - result = gcs_adapter.read_block(path, offset, length, delimiter) +def test_read_small_zb(gcs_adapter, zonal_mocks): + csv_file = "2014-01-01.csv" + csv_file_path = f"{TEST_BUCKET}/{csv_file}" + csv_data = csv_files[csv_file] - assert result == expected_data - mock_sync_get_layout.assert_called_once_with(TEST_BUCKET) - if expected_data: - mock_downloader.download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) - else: - mock_downloader.download_ranges.assert_not_called() + with zonal_mocks(csv_data) as mocks: + with gcs_adapter.open(csv_file_path, "rb", block_size=10) as f: + out = [] + i = 1 + while True: + i += 1 + data = f.read(3) + if data == b"": + break + out.append(data) + assert gcs_adapter.cat(csv_file_path) == b"".join(out) + # cache drop + assert len(f.cache.cache) < len(out) + if mocks: + mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + + +def test_readline_zb(gcs_adapter, zonal_mocks): + all_items = chain.from_iterable( + [files.items(), csv_files.items(), text_files.items()] + ) + for k, data in all_items: + with zonal_mocks(data) as mocks: + with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: + result = f.readline() + expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") + assert result == expected + + +def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): + data = b"a,b\n11,22\n3,4" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(a, "wb") as f: + f.write(data) + with gcs_adapter.open(a, "rb") as f: + result = f.readline() + assert result == b"a,b\n" + assert f.loc == 4 + assert f.cache.cache == data + + result = f.readline() + assert result == b"11,22\n" + assert f.loc == 10 + assert f.cache.cache == data + + result = f.readline() + assert result == b"3,4" + assert f.loc == 13 + assert f.cache.cache == data + + +def test_readline_empty_zb(gcs_adapter, zonal_mocks): + data = b"" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(b, "wb") as f: + f.write(data) + with gcs_adapter.open(b, "rb") as f: + result = f.readline() + assert result == data + + +def test_readline_blocksize_zb(gcs_adapter, 
zonal_mocks): + data = b"ab\n" + b"a" * (2 ** 18) + b"\nab" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(c, "wb") as f: + f.write(data) + with gcs_adapter.open(c, "rb", block_size=2 ** 18) as f: + result = f.readline() + expected = b"ab\n" + assert result == expected + + result = f.readline() + expected = b"a" * (2 ** 18) + b"\n" + assert result == expected + + result = f.readline() + expected = b"ab" + assert result == expected From 84254642c6d810f807818f218ec48271763889b3 Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 05:30:20 +0000 Subject: [PATCH 28/59] Updated gcs_adapter.open to pass on correct default values to GCSFile --- gcsfs/gcsfs_adapter.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index b7dd738b..f443e9b4 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -67,6 +67,14 @@ def _open( self, path, mode="rb", + block_size=None, + cache_options=None, + acl=None, + consistency=None, + metadata=None, + autocommit=True, + fixed_key_metadata=None, + generation=None, **kwargs, ): """ @@ -74,7 +82,16 @@ def _open( """ bucket, _, _ = self.split_path(path) bucket_type = self._sync_get_storage_layout(bucket) - return gcs_file_types[bucket_type](gcsfs=self, path=path, mode=mode, **kwargs) + return gcs_file_types[bucket_type](self, path, + mode, + block_size, + cache_options=cache_options, + consistency=consistency, + metadata=metadata, + acl=acl, + autocommit=autocommit, + fixed_key_metadata=fixed_key_metadata, + **kwargs,) From 75fecce7ba8a2f227b9826c08d3678219ecfc6be Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 05:33:45 +0000 Subject: [PATCH 29/59] Move logic for handling 0 length in MRD to zb_hns_utils --- gcsfs/gcsfs_adapter.py | 4 ---- gcsfs/zb_hns_utils.py | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index f443e9b4..b5d66108 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -167,8 +167,4 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) offset, length = await self.process_limits_to_offset_and_length(path, start, end) - # If length = 0, mrd returns till end of file, so handle that case here - if length == 0: - return b"" - return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py index 0ad52a20..2fb8dfbe 100644 --- a/gcsfs/zb_hns_utils.py +++ b/gcsfs/zb_hns_utils.py @@ -17,6 +17,9 @@ async def download_range(offset, length, mrd): """ Downloads a byte range from the file asynchronously. """ + # If length = 0, mrd returns till end of file, so handle that case here + if length == 0: + return b"" buffer = BytesIO() await mrd.download_ranges([(offset, length, buffer)]) return buffer.getvalue() From 2b6af9cd99c71a464b16703bbdf1846566889a14 Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 14:44:35 +0000 Subject: [PATCH 30/59] Updated zonal file to only create mrd for read mode. 
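The MRD only serves range reads, so write handles previously set up a gRPC downloader that was never used. A rough sketch of the resulting behaviour (bucket and object names are illustrative): fs.open("zonal-bucket/blob", "wb") now creates no MRD, while fs.open("zonal-bucket/blob", "rb") still initializes one for the gRPC read path.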
--- gcsfs/zonal_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index a9eaad95..9059fd7c 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -15,7 +15,9 @@ def __init__(self, *args, **kwargs): Initializes the ZonalFile object. """ super().__init__(*args, **kwargs) - self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) + self.mrd = None + if "r" in self.mode: + self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) From b648df479c7e18e61e8674f29982d6b670b8434d Mon Sep 17 00:00:00 2001 From: suni72 Date: Mon, 27 Oct 2025 15:01:02 +0000 Subject: [PATCH 31/59] Updated gcs_adapter fixture to not set up the bucket when the real GCS endpoint is being used. Updated zonal tests to work with both real and fake GCS. Added more tests for zonal path reads. --- gcsfs/tests/conftest.py | 30 +++-- gcsfs/tests/test_gcsfs_adapter.py | 179 +++++++++++++++++++++++++----- 2 files changed, 167 insertions(+), 42 deletions(-) diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 00025956..d94ceaa9 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -126,23 +126,29 @@ def gcs_adapter(gcs_factory, populate=True): gcs_adapter = gcs_factory(experimental_zb_hns_support=True) + # Check if we are running against a real GCS endpoint + is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" try: - try: - gcs_adapter.rm(TEST_BUCKET, recursive=True) - except FileNotFoundError: - pass - try: - gcs_adapter.mkdir(TEST_BUCKET) - except Exception: - pass - if populate: - gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) + # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint + if not is_real_gcs: + try: + gcs_adapter.rm(TEST_BUCKET, recursive=True) + except FileNotFoundError: + pass + try: + gcs_adapter.mkdir(TEST_BUCKET) + except Exception: + pass + if populate: + gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) gcs_adapter.invalidate_cache() yield gcs_adapter finally: try: - gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) - gcs_adapter.rm(TEST_BUCKET) + # Only remove the bucket/contents if we are NOT using the real GCS + if not is_real_gcs: + gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) + gcs_adapter.rm(TEST_BUCKET) except Exception: pass diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 20a36568..a5148785 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -1,29 +1,75 @@ +import contextlib import io +import os +from itertools import chain from unittest import mock import pytest from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader from gcsfs.gcsfs_adapter import BucketType -from gcsfs.tests.conftest import files +from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files from gcsfs.tests.settings import TEST_BUCKET file = "test/accounts.1.json" file_path = f"{TEST_BUCKET}/{file}" -data = files[file] -lines = io.BytesIO(data).readlines() -file_size = len(data) +json_data = files[file] +lines = io.BytesIO(json_data).readlines() +file_size = len(json_data) + + +@pytest.fixture +def zonal_mocks(): + """A factory fixture for mocking Zonal bucket 
functionality.""" + + @contextlib.contextmanager + def _zonal_mocks_factory(file_data): + """Creates mocks for a given file content.""" + is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + if is_real_gcs: + yield None + return + patch_target_get_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_storage_layout" + patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" + patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" + patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" + + async def download_side_effect(read_requests, **kwargs): + if read_requests and len(read_requests) == 1: + param_offset, param_length, buffer_arg = read_requests[0] + if hasattr(buffer_arg, 'write'): + buffer_arg.write(file_data[param_offset:param_offset + param_length]) + return [mock.Mock(error=None)] + + mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) + mock_downloader.download_ranges = mock.AsyncMock(side_effect=download_side_effect) + + mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) + with mock.patch(patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL) as mock_sync_layout, \ + mock.patch(patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL), \ + mock.patch(patch_target_create_mrd, mock_create_mrd), \ + mock.patch(patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock) as mock_cat_file: + mocks = {"sync_layout": mock_sync_layout, + "create_mrd": mock_create_mrd, + "downloader": mock_downloader, + "cat_file": mock_cat_file} + yield mocks + # Common assertion for all tests using this mock + mock_cat_file.assert_not_called() + + yield _zonal_mocks_factory + read_block_params = [ # Read specific chunk - pytest.param(3, 10, None, data[3:3 + 10], id="offset=3, length=10"), + pytest.param(3, 10, None, json_data[3:3 + 10], id="offset=3, length=10"), # Read from beginning up to length - pytest.param(0, 5, None, data[0:5], id="offset=0, length=5"), + pytest.param(0, 5, None, json_data[0:5], id="offset=0, length=5"), # Read from offset to end (simulate large length) - pytest.param(15, 5000, None, data[15:], id="offset=15, length=large"), + pytest.param(15, 5000, None, json_data[15:], id="offset=15, length=large"), # Read beyond end of file (should return empty bytes) pytest.param(file_size + 10, 5, None, b"", id="offset>size, length=5"), - # Read exactly at the end (zero length effective) + # Read exactly at the end (zero length) pytest.param(file_size, 10, None, b"", id="offset=size, length=10"), # Read with delimiter pytest.param(1, 35, b'\n', lines[1], id="offset=1, length=35, delimiter=newline"), @@ -33,31 +79,104 @@ @pytest.mark.parametrize("offset, length, delimiter, expected_data", read_block_params) -def test_read_block_zonal_path(gcs_adapter, offset, length, delimiter, expected_data): +def test_read_block_zb(gcs_adapter, zonal_mocks, offset, length, delimiter, expected_data): path = file_path - async def mock_download_ranges_side_effect(read_requests, **kwargs): - if read_requests and len(read_requests) == 1: - param_offset, param_length, buffer_arg = read_requests[0] - if hasattr(buffer_arg, 'write'): - buffer_arg.write(data[param_offset:param_offset + param_length]) - return [mock.Mock(error=None)] + with zonal_mocks(json_data) as mocks: + result = gcs_adapter.read_block(path, offset, length, delimiter) - patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" - mock_sync_get_layout = 
mock.Mock(return_value=BucketType.ZONAL_HIERARCHICAL) + assert result == expected_data + if mocks: + mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + if expected_data: + mocks["downloader"].download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) + else: + mocks["downloader"].download_ranges.assert_not_called() - patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" - mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) - mock_downloader.download_ranges = mock.AsyncMock(side_effect=mock_download_ranges_side_effect) - mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) - with mock.patch(patch_target_sync_layout, mock_sync_get_layout), \ - mock.patch(patch_target_create_mrd, mock_create_mrd): - result = gcs_adapter.read_block(path, offset, length, delimiter) +def test_read_small_zb(gcs_adapter, zonal_mocks): + csv_file = "2014-01-01.csv" + csv_file_path = f"{TEST_BUCKET}/{csv_file}" + csv_data = csv_files[csv_file] - assert result == expected_data - mock_sync_get_layout.assert_called_once_with(TEST_BUCKET) - if expected_data: - mock_downloader.download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) - else: - mock_downloader.download_ranges.assert_not_called() + with zonal_mocks(csv_data) as mocks: + with gcs_adapter.open(csv_file_path, "rb", block_size=10) as f: + out = [] + i = 1 + while True: + i += 1 + data = f.read(3) + if data == b"": + break + out.append(data) + assert gcs_adapter.cat(csv_file_path) == b"".join(out) + # cache drop + assert len(f.cache.cache) < len(out) + if mocks: + mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + + +def test_readline_zb(gcs_adapter, zonal_mocks): + all_items = chain.from_iterable( + [files.items(), csv_files.items(), text_files.items()] + ) + for k, data in all_items: + with zonal_mocks(data) as mocks: + with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: + result = f.readline() + expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") + assert result == expected + + +def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): + data = b"a,b\n11,22\n3,4" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(a, "wb") as f: + f.write(data) + with gcs_adapter.open(a, "rb") as f: + result = f.readline() + assert result == b"a,b\n" + assert f.loc == 4 + assert f.cache.cache == data + + result = f.readline() + assert result == b"11,22\n" + assert f.loc == 10 + assert f.cache.cache == data + + result = f.readline() + assert result == b"3,4" + assert f.loc == 13 + assert f.cache.cache == data + + +def test_readline_empty_zb(gcs_adapter, zonal_mocks): + data = b"" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(b, "wb") as f: + f.write(data) + with gcs_adapter.open(b, "rb") as f: + result = f.readline() + assert result == data + + +def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): + data = b"ab\n" + b"a" * (2 ** 18) + b"\nab" + with zonal_mocks(data) as mocks: + if mocks: + with gcs_adapter.open(c, "wb") as f: + f.write(data) + with gcs_adapter.open(c, "rb", block_size=2 ** 18) as f: + result = f.readline() + expected = b"ab\n" + assert result == expected + + result = f.readline() + expected = b"a" * (2 ** 18) + b"\n" + assert result == expected + + result = f.readline() + expected = b"ab" + assert result == expected From 1c99137611f82191af8edc72d1e69f49a83b73a7 Mon Sep 17 00:00:00 2001 From: suni72 Date: Tue, 28 Oct 2025 08:32:27 +0000 Subject: [PATCH 32/59] Update test_read_block_zb to use subtests to avoid 
frequent setup run --- gcsfs/tests/test_gcsfs_adapter.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index a5148785..3bfe4991 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -78,20 +78,22 @@ async def download_side_effect(read_requests, **kwargs): ] -@pytest.mark.parametrize("offset, length, delimiter, expected_data", read_block_params) -def test_read_block_zb(gcs_adapter, zonal_mocks, offset, length, delimiter, expected_data): - path = file_path - - with zonal_mocks(json_data) as mocks: - result = gcs_adapter.read_block(path, offset, length, delimiter) - - assert result == expected_data - if mocks: - mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) - if expected_data: - mocks["downloader"].download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) - else: - mocks["downloader"].download_ranges.assert_not_called() +def test_read_block_zb(gcs_adapter, zonal_mocks, subtests): + for param in read_block_params: + with subtests.test(id=param.id): + offset, length, delimiter, expected_data = param.values + path = file_path + + with zonal_mocks(json_data) as mocks: + result = gcs_adapter.read_block(path, offset, length, delimiter) + + assert result == expected_data + if mocks: + mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + if expected_data: + mocks["downloader"].download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) + else: + mocks["downloader"].download_ranges.assert_not_called() def test_read_small_zb(gcs_adapter, zonal_mocks): From 1099375dc10c04da4ca9d6658d44bed01b93f65f Mon Sep 17 00:00:00 2001 From: Mahalaxmi Date: Wed, 29 Oct 2025 06:37:48 +0000 Subject: [PATCH 33/59] fix: Optimizes info() and exists() methods This commit introduces an optimization to the `info()` method to reduce the number of GCS API calls when inspecting a directory-like path that only contains subdirectories without any immediate objects. 
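
Previously the pagination loop only counted objects ("items") toward
max_results, so a delimited listing that returned mostly prefixes
(subdirectories) kept requesting further pages. Counting prefixes as well
lets the loop stop as soon as enough entries of either kind have been seen.
A simplified sketch of the new termination condition (fetch_page is a
hypothetical stand-in for the paginated "GET b/{bucket}/o" call; the real
helper is _sequential_list_objects_helper):

    items, prefixes = [], []
    page = fetch_page(token=None)  # first page
    items.extend(page.get("items", []))
    prefixes.extend(page.get("prefixes", []))
    token = page.get("nextPageToken")
    # Old condition: len(items) < max_results -- never satisfied early
    # when a prefix-only listing returns no items at all.
    while len(items) + len(prefixes) < max_results and token is not None:
        page = fetch_page(token=token)
        items.extend(page.get("items", []))
        prefixes.extend(page.get("prefixes", []))
        token = page.get("nextPageToken")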
Also includes a test case to check the number of API calls for this edge case --- gcsfs/core.py | 2 +- gcsfs/tests/test_core.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 91dd3d9f..18b4eb3e 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -796,7 +796,7 @@ async def _sequential_list_objects_helper( items.extend(page.get("items", [])) next_page_token = page.get("nextPageToken", None) - while len(items) < max_results and next_page_token is not None: + while len(items) + len(prefixes) < max_results and next_page_token is not None: num_items = min(items_per_call, max_results - len(items), 1000) page = await self._call( "GET", diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index f3cc3870..3a735e96 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -223,6 +223,27 @@ def test_info(gcs): assert gcs.info(a)["ctime"] == gcs.created(a) assert gcs.info(a)["mtime"] == gcs.modified(a) +def test_info_on_directory_with_only_subdirectories(gcs): + """Test info() on a path that contains no direct files but has subdirectories.""" + # Setup: create a file inside a nested directory + path = f"{TEST_BUCKET}/dir_with_only_subdirs/subdir1/file.txt" + gcs.touch(path) + path = f"{TEST_BUCKET}/dir_with_only_subdirs/subdir2/file.txt" + gcs.touch(path) + + # The path to test with info() + dir_path = f"{TEST_BUCKET}/dir_with_only_subdirs" + + # Mock the _call method to count invocations + with mock.patch.object(gcs, "_call", wraps=gcs._call) as mock_call: + # Get info for the directory + info = gcs.info(dir_path) + + # Assertions + assert info['type'] == 'directory' + assert info['name'] == dir_path + # one call to check for exact file and one for directory + assert mock_call.call_count == 2, "info() should only make two calls to GCS for a directory." def test_ls2(gcs): assert TEST_BUCKET + "/" in gcs.ls("") From d834b07d3e6b91f50d3c4f9863afde07f48472ab Mon Sep 17 00:00:00 2001 From: Mahalaxmi Date: Wed, 29 Oct 2025 06:37:48 +0000 Subject: [PATCH 34/59] fix: Optimizes info() and exists() methods This commit introduces an optimization to the `info()` method to reduce the number of GCS API calls when inspecting a directory-like path that only contains subdirectories without any immediate objects. Also includes a test case to check the number of API calls for this edge case --- gcsfs/tests/test_core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 3a735e96..114f28af 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -230,6 +230,10 @@ def test_info_on_directory_with_only_subdirectories(gcs): gcs.touch(path) path = f"{TEST_BUCKET}/dir_with_only_subdirs/subdir2/file.txt" gcs.touch(path) + path = f"{TEST_BUCKET}/dir_with_only_subdirs/subdir3/file.txt" + gcs.touch(path) + path = f"{TEST_BUCKET}/dir_with_only_subdirs/subdir4/file.txt" + gcs.touch(path) # The path to test with info() dir_path = f"{TEST_BUCKET}/dir_with_only_subdirs" @@ -242,7 +246,7 @@ def test_info_on_directory_with_only_subdirectories(gcs): # Assertions assert info['type'] == 'directory' assert info['name'] == dir_path - # one call to check for exact file and one for directory + # one call is for exact file check and one call for directory assert mock_call.call_count == 2, "info() should only make two calls to GCS for a directory." 
def test_ls2(gcs): From bfd513f5bda16ab70c554edb4dbd10f07002c937 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 29 Oct 2025 14:48:40 +0000 Subject: [PATCH 35/59] fixes lint errors --- gcsfs/tests/test_core.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 114f28af..a93dbd4c 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -223,6 +223,7 @@ def test_info(gcs): assert gcs.info(a)["ctime"] == gcs.created(a) assert gcs.info(a)["mtime"] == gcs.modified(a) + def test_info_on_directory_with_only_subdirectories(gcs): """Test info() on a path that contains no direct files but has subdirectories.""" # Setup: create a file inside a nested directory @@ -244,10 +245,13 @@ def test_info_on_directory_with_only_subdirectories(gcs): info = gcs.info(dir_path) # Assertions - assert info['type'] == 'directory' - assert info['name'] == dir_path + assert info["type"] == "directory" + assert info["name"] == dir_path # one call is for exact file check and one call for directory - assert mock_call.call_count == 2, "info() should only make two calls to GCS for a directory." + assert ( + mock_call.call_count == 2 + ), "info() should only make two calls to GCS for a directory." + def test_ls2(gcs): assert TEST_BUCKET + "/" in gcs.ls("") From efabe35ccfce6003e5fc4931e6bc096cfa1949c1 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 29 Oct 2025 18:36:51 +0000 Subject: [PATCH 36/59] fixes comments --- gcsfs/core.py | 2 +- gcsfs/gcsfs_adapter.py | 8 ++++++-- gcsfs/zonal_file.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index e43b30fc..68996926 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -284,7 +284,7 @@ class GCSFileSystem(asyn.AsyncFileSystem): def __new__(cls, *args, **kwargs): """ - Factory to return a GCSHNSFileSystem instance if the experimental + Factory to return a GCSFileSystemAdapter instance if the experimental flag is enabled. """ experimental_support = kwargs.pop('experimental_zb_hns_support', False) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index b5d66108..59885dc9 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -28,14 +28,18 @@ class BucketType(Enum): class GCSFileSystemAdapter(GCSFileSystem): """ - An subclass of GCSFileSystem that will contain specialized - logic for ZB and HNS buckets. + This class will be used when experimental_zb_hns_support is set to true for all bucket types. + GCSFileSystemAdapter is a subclass of GCSFileSystem that adds specialized + logic to support Zonal and Hierarchical buckets. """ def __init__(self, *args, **kwargs): kwargs.pop('experimental_zb_hns_support', None) super().__init__(*args, **kwargs) self.grpc_client = None + # grpc client initialisation is not a resource blocking operation hence + # initialising the client in early to reduce code duplication for initialisation in + # multiple methods self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) self._storage_layout_cache = {} diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index 9059fd7c..b967e500 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -6,8 +6,8 @@ class ZonalFile(GCSFile): """ - GCSFile subclass designed to handle reads from - Zonal buckets using a high-performance gRPC path. + ZonalFile is subclass of GCSFile and handles data operations from + Zonal buckets only using a high-performance gRPC path. 
""" def __init__(self, *args, **kwargs): From 2ed3cc6c97f47fffb8917c33aa51450a240276eb Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 29 Oct 2025 18:38:25 +0000 Subject: [PATCH 37/59] fixes lint errors --- gcsfs/core.py | 4 +- gcsfs/gcsfs_adapter.py | 54 ++++++++++++++---------- gcsfs/tests/conftest.py | 10 ++++- gcsfs/tests/test_core.py | 16 +++++--- gcsfs/tests/test_gcsfs_adapter.py | 68 ++++++++++++++++++++----------- gcsfs/tests/test_zb_hns_utils.py | 2 +- gcsfs/zb_hns_utils.py | 3 +- gcsfs/zonal_file.py | 8 +++- requirements.txt | 2 +- 9 files changed, 109 insertions(+), 58 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 68996926..95308a8e 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -68,6 +68,7 @@ "custom_time": "customTime", } + def quote(s): """ Quote characters to be safe for URL paths. @@ -287,10 +288,11 @@ def __new__(cls, *args, **kwargs): Factory to return a GCSFileSystemAdapter instance if the experimental flag is enabled. """ - experimental_support = kwargs.pop('experimental_zb_hns_support', False) + experimental_support = kwargs.pop("experimental_zb_hns_support", False) if experimental_support: from .gcsfs_adapter import GCSFileSystemAdapter + return object.__new__(GCSFileSystemAdapter) else: return object.__new__(cls) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 59885dc9..8a716ecc 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -2,10 +2,11 @@ from enum import Enum from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from google.cloud.storage._experimental.asyncio.async_grpc_client import \ + AsyncGrpcClient from . import zb_hns_utils -from .core import GCSFileSystem, GCSFile +from .core import GCSFile, GCSFileSystem from .zonal_file import ZonalFile logger = logging.getLogger("gcsfs") @@ -34,7 +35,7 @@ class GCSFileSystemAdapter(GCSFileSystem): """ def __init__(self, *args, **kwargs): - kwargs.pop('experimental_zb_hns_support', None) + kwargs.pop("experimental_zb_hns_support", None) super().__init__(*args, **kwargs) self.grpc_client = None # grpc client initialisation is not a resource blocking operation hence @@ -53,7 +54,9 @@ async def _get_storage_layout(self, bucket): if bucket in self._storage_layout_cache: return self._storage_layout_cache[bucket] try: - response = await self._call("GET", f"b/{bucket}/storageLayout", json_out=True) + response = await self._call( + "GET", f"b/{bucket}/storageLayout", json_out=True + ) if response.get("locationType") == "zone": self._storage_layout_cache[bucket] = BucketType.ZONAL_HIERARCHICAL else: @@ -68,25 +71,27 @@ async def _get_storage_layout(self, bucket): _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout) def _open( - self, - path, - mode="rb", - block_size=None, - cache_options=None, - acl=None, - consistency=None, - metadata=None, - autocommit=True, - fixed_key_metadata=None, - generation=None, - **kwargs, + self, + path, + mode="rb", + block_size=None, + cache_options=None, + acl=None, + consistency=None, + metadata=None, + autocommit=True, + fixed_key_metadata=None, + generation=None, + **kwargs, ): """ Open a file. 
""" bucket, _, _ = self.split_path(path) bucket_type = self._sync_get_storage_layout(bucket) - return gcs_file_types[bucket_type](self, path, + return gcs_file_types[bucket_type]( + self, + path, mode, block_size, cache_options=cache_options, @@ -95,7 +100,8 @@ def _open( acl=acl, autocommit=autocommit, fixed_key_metadata=fixed_key_metadata, - **kwargs,) + **kwargs, + ) # Replacement method for _process_limits to support new params (offset and length) for MRD. async def process_limits_to_offset_and_length(self, path, start, end): @@ -136,7 +142,9 @@ async def process_limits_to_offset_and_length(self, path, start, end): if offset < 0: raise ValueError(f"Calculated start offset ({offset}) cannot be negative.") if effective_end < offset: - raise ValueError(f"Calculated end position ({effective_end}) cannot be before start offset ({offset}).") + raise ValueError( + f"Calculated end position ({effective_end}) cannot be before start offset ({offset})." + ) elif effective_end == offset: length = 0 # Handle zero-length slice elif effective_end > size: @@ -168,7 +176,11 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): if not await self.is_zonal_bucket(bucket): return await super()._cat_file(path, start=start, end=end, **kwargs) - mrd = await zb_hns_utils.create_mrd(self.grpc_client, bucket, object_name, generation) + mrd = await zb_hns_utils.create_mrd( + self.grpc_client, bucket, object_name, generation + ) - offset, length = await self.process_limits_to_offset_and_length(path, start, end) + offset, length = await self.process_limits_to_offset_and_length( + path, start, end + ) return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index d94ceaa9..65893dbf 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -123,11 +123,14 @@ def gcs(gcs_factory, populate=True): except: # noqa: E722 pass + @pytest.fixture def gcs_adapter(gcs_factory, populate=True): gcs_adapter = gcs_factory(experimental_zb_hns_support=True) # Check if we are running against a real GCS endpoint - is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + is_real_gcs = ( + os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + ) try: # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint if not is_real_gcs: @@ -140,7 +143,9 @@ def gcs_adapter(gcs_factory, populate=True): except Exception: pass if populate: - gcs_adapter.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) + gcs_adapter.pipe( + {TEST_BUCKET + "/" + k: v for k, v in allfiles.items()} + ) gcs_adapter.invalidate_cache() yield gcs_adapter finally: @@ -152,6 +157,7 @@ def gcs_adapter(gcs_factory, populate=True): except Exception: pass + @pytest.fixture def gcs_versioned(gcs_factory): gcs = gcs_factory() diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 442178cc..7336990f 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1708,13 +1708,17 @@ def test_get_error(gcs): def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory): gcs = gcs_factory() - assert isinstance(gcs, gcsfs.GCSFileSystem), "Expected File system instance to be GCSFileSystem" - assert not isinstance(gcs, - GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystem" + assert isinstance( + gcs, gcsfs.GCSFileSystem + ), "Expected File system instance to be GCSFileSystem" + assert not isinstance( + gcs, 
GCSFileSystemAdapter + ), "Expected File system instance to be GCSFileSystem" + def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): gcs = gcs_factory(experimental_zb_hns_support=True) - assert isinstance(gcs, - GCSFileSystemAdapter), "Expected File system instance to be GCSFileSystemAdapter" - + assert isinstance( + gcs, GCSFileSystemAdapter + ), "Expected File system instance to be GCSFileSystemAdapter" diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 3bfe4991..8a9160e8 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -5,7 +5,8 @@ from unittest import mock import pytest -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ + AsyncMultiRangeDownloader from gcsfs.gcsfs_adapter import BucketType from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files @@ -25,34 +26,51 @@ def zonal_mocks(): @contextlib.contextmanager def _zonal_mocks_factory(file_data): """Creates mocks for a given file content.""" - is_real_gcs = os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + is_real_gcs = ( + os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + ) if is_real_gcs: yield None return - patch_target_get_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_storage_layout" - patch_target_sync_layout = "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" + patch_target_get_layout = ( + "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_storage_layout" + ) + patch_target_sync_layout = ( + "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout" + ) patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" async def download_side_effect(read_requests, **kwargs): if read_requests and len(read_requests) == 1: param_offset, param_length, buffer_arg = read_requests[0] - if hasattr(buffer_arg, 'write'): - buffer_arg.write(file_data[param_offset:param_offset + param_length]) + if hasattr(buffer_arg, "write"): + buffer_arg.write( + file_data[param_offset : param_offset + param_length] + ) return [mock.Mock(error=None)] mock_downloader = mock.Mock(spec=AsyncMultiRangeDownloader) - mock_downloader.download_ranges = mock.AsyncMock(side_effect=download_side_effect) + mock_downloader.download_ranges = mock.AsyncMock( + side_effect=download_side_effect + ) mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) - with mock.patch(patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL) as mock_sync_layout, \ - mock.patch(patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL), \ - mock.patch(patch_target_create_mrd, mock_create_mrd), \ - mock.patch(patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock) as mock_cat_file: - mocks = {"sync_layout": mock_sync_layout, - "create_mrd": mock_create_mrd, - "downloader": mock_downloader, - "cat_file": mock_cat_file} + with mock.patch( + patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL + ) as mock_sync_layout, mock.patch( + patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL + ), mock.patch( + patch_target_create_mrd, mock_create_mrd + ), mock.patch( + patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock + ) as mock_cat_file: + mocks = { + "sync_layout": mock_sync_layout, + "create_mrd": 
mock_create_mrd, + "downloader": mock_downloader, + "cat_file": mock_cat_file, + } yield mocks # Common assertion for all tests using this mock mock_cat_file.assert_not_called() @@ -62,7 +80,7 @@ async def download_side_effect(read_requests, **kwargs): read_block_params = [ # Read specific chunk - pytest.param(3, 10, None, json_data[3:3 + 10], id="offset=3, length=10"), + pytest.param(3, 10, None, json_data[3 : 3 + 10], id="offset=3, length=10"), # Read from beginning up to length pytest.param(0, 5, None, json_data[0:5], id="offset=0, length=5"), # Read from offset to end (simulate large length) @@ -72,9 +90,11 @@ async def download_side_effect(read_requests, **kwargs): # Read exactly at the end (zero length) pytest.param(file_size, 10, None, b"", id="offset=size, length=10"), # Read with delimiter - pytest.param(1, 35, b'\n', lines[1], id="offset=1, length=35, delimiter=newline"), - pytest.param(0, 30, b'\n', lines[0], id="offset=0, length=35, delimiter=newline"), - pytest.param(0, 35, b'\n', lines[0] + lines[1], id="offset=0, length=35, delimiter=newline"), + pytest.param(1, 35, b"\n", lines[1], id="offset=1, length=35, delimiter=newline"), + pytest.param(0, 30, b"\n", lines[0], id="offset=0, length=35, delimiter=newline"), + pytest.param( + 0, 35, b"\n", lines[0] + lines[1], id="offset=0, length=35, delimiter=newline" + ), ] @@ -91,7 +111,9 @@ def test_read_block_zb(gcs_adapter, zonal_mocks, subtests): if mocks: mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) if expected_data: - mocks["downloader"].download_ranges.assert_called_with([(offset, mock.ANY, mock.ANY)]) + mocks["downloader"].download_ranges.assert_called_with( + [(offset, mock.ANY, mock.ANY)] + ) else: mocks["downloader"].download_ranges.assert_not_called() @@ -165,18 +187,18 @@ def test_readline_empty_zb(gcs_adapter, zonal_mocks): def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): - data = b"ab\n" + b"a" * (2 ** 18) + b"\nab" + data = b"ab\n" + b"a" * (2**18) + b"\nab" with zonal_mocks(data) as mocks: if mocks: with gcs_adapter.open(c, "wb") as f: f.write(data) - with gcs_adapter.open(c, "rb", block_size=2 ** 18) as f: + with gcs_adapter.open(c, "rb", block_size=2**18) as f: result = f.readline() expected = b"ab\n" assert result == expected result = f.readline() - expected = b"a" * (2 ** 18) + b"\n" + expected = b"a" * (2**18) + b"\n" assert result == expected result = f.readline() diff --git a/gcsfs/tests/test_zb_hns_utils.py b/gcsfs/tests/test_zb_hns_utils.py index 23682b85..51e34ce8 100644 --- a/gcsfs/tests/test_zb_hns_utils.py +++ b/gcsfs/tests/test_zb_hns_utils.py @@ -54,4 +54,4 @@ async def mock_download_ranges(ranges): result = await zb_hns_utils.download_range(offset, length, mock_mrd) mock_mrd.download_ranges.assert_called_once_with([(offset, length, mock.ANY)]) - assert result == expected_data \ No newline at end of file + assert result == expected_data diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py index 2fb8dfbe..aa8ff3e0 100644 --- a/gcsfs/zb_hns_utils.py +++ b/gcsfs/zb_hns_utils.py @@ -1,6 +1,7 @@ from io import BytesIO -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ + AsyncMultiRangeDownloader async def create_mrd(grpc_client, bucket_name, object_name, generation=None): diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index b967e500..be5df204 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -17,13 +17,17 @@ def 
__init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.mrd = None if "r" in self.mode: - self.mrd = asyn.sync(self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation) + self.mrd = asyn.sync( + self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation + ) async def _init_mrd(self, bucket_name, object_name, generation=None): """ Initializes the AsyncMultiRangeDownloader. """ - return await zb_hns_utils.create_mrd(self.gcsfs.grpc_client, bucket_name, object_name, generation) + return await zb_hns_utils.create_mrd( + self.gcsfs.grpc_client, bucket_name, object_name, generation + ) def _fetch_range(self, start, end): """ diff --git a/requirements.txt b/requirements.txt index 448e8b3a..032e04f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ fsspec==2025.9.0 google-auth>=1.2 google-auth-oauthlib google-cloud-storage +pytest requests -pytest \ No newline at end of file From 957d7b518fa5b964d432481322eea93d854ab459 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 29 Oct 2025 19:00:26 +0000 Subject: [PATCH 38/59] adds grpc and google-iam dependency --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 032e04f9..c4d099f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,8 @@ decorator>4.1.2 fsspec==2025.9.0 google-auth>=1.2 google-auth-oauthlib +google-cloud-iam google-cloud-storage +grpcio pytest requests From cd222cbbad262d128779e74e6365975156740d1f Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Thu, 30 Oct 2025 09:43:56 +0000 Subject: [PATCH 39/59] Fix missing argument in open made private: _is_zonal_bucket and _process_limits_to_offset_and_length fix process method to handle negative length add unit test for _process_limits_to_offset_and_length --- gcsfs/gcsfs_adapter.py | 26 ++++++++++----------- gcsfs/tests/test_gcsfs_adapter.py | 38 +++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 8a716ecc..43a314e9 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -2,8 +2,7 @@ from enum import Enum from fsspec import asyn -from google.cloud.storage._experimental.asyncio.async_grpc_client import \ - AsyncGrpcClient +from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient from . import zb_hns_utils from .core import GCSFile, GCSFileSystem @@ -38,8 +37,8 @@ def __init__(self, *args, **kwargs): kwargs.pop("experimental_zb_hns_support", None) super().__init__(*args, **kwargs) self.grpc_client = None - # grpc client initialisation is not a resource blocking operation hence - # initialising the client in early to reduce code duplication for initialisation in + # grpc client initialization is not a resource blocking operation hence + # initializing the client in early to reduce code duplication for initialization in # multiple methods self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) self._storage_layout_cache = {} @@ -100,11 +99,12 @@ def _open( acl=acl, autocommit=autocommit, fixed_key_metadata=fixed_key_metadata, + generation=generation, **kwargs, ) # Replacement method for _process_limits to support new params (offset and length) for MRD. - async def process_limits_to_offset_and_length(self, path, start, end): + async def _process_limits_to_offset_and_length(self, path, start, end): """ Calculates the read offset and length from start and end parameters. 
@@ -148,17 +148,17 @@ async def process_limits_to_offset_and_length(self, path, start, end): elif effective_end == offset: length = 0 # Handle zero-length slice elif effective_end > size: - length = size - offset # Clamp length to file size + length = max(0, size - offset) # Clamp and ensure non-negative else: length = effective_end - offset # Normal case - if offset + length > size: - # This might happen with large positive end values - length = max(0, size - offset) - return offset, length - async def is_zonal_bucket(self, bucket): + sync_process_limits_to_offset_and_length = asyn.sync_wrapper( + _process_limits_to_offset_and_length + ) + + async def _is_zonal_bucket(self, bucket): layout = await self._get_storage_layout(bucket) return layout == BucketType.ZONAL_HIERARCHICAL @@ -173,14 +173,14 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): if mrd is None: bucket, object_name, generation = self.split_path(path) # Fall back to default implementation if not a zonal bucket - if not await self.is_zonal_bucket(bucket): + if not await self._is_zonal_bucket(bucket): return await super()._cat_file(path, start=start, end=end, **kwargs) mrd = await zb_hns_utils.create_mrd( self.grpc_client, bucket, object_name, generation ) - offset, length = await self.process_limits_to_offset_and_length( + offset, length = await self._process_limits_to_offset_and_length( path, start, end ) return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 8a9160e8..fe1943db 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -5,8 +5,9 @@ from unittest import mock import pytest -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ - AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( + AsyncMultiRangeDownloader, +) from gcsfs.gcsfs_adapter import BucketType from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files @@ -204,3 +205,36 @@ def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): result = f.readline() expected = b"ab" assert result == expected + + +@pytest.mark.parametrize( + "start,end,exp_offset,exp_length,exp_exc", + [ + (None, None, 0, file_size, None), # full file + (-10, None, file_size - 10, 10, None), # start negative + (10, -10, 10, file_size - 20, None), # end negative + (20, 20, 20, 0, None), # zero-length slice + (50, 40, None, None, ValueError), # end before start -> raises + (-200, None, None, None, ValueError), # offset negative -> raises + (file_size - 10, 200, file_size - 10, 10, None), # end > size clamps + ( + file_size + 10, + file_size + 20, + file_size + 10, + 0, + None, + ), # offset > size -> empty + ], +) +def test_process_limits_parametrized( + gcs_adapter, start, end, exp_offset, exp_length, exp_exc +): + if exp_exc is not None: + with pytest.raises(exp_exc): + gcs_adapter.sync_process_limits_to_offset_and_length(file_path, start, end) + else: + offset, length = gcs_adapter.sync_process_limits_to_offset_and_length( + file_path, start, end + ) + assert offset == exp_offset + assert length == exp_length From 17618d5f5b207dfc515b1cc347190e254e12e6ec Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Thu, 30 Oct 2025 11:26:35 +0000 Subject: [PATCH 40/59] Fix: Raise NotImplementedError for modes other than read in Zonal bucket Add test for failure scenarios in mrd --- gcsfs/tests/test_gcsfs_adapter.py | 48 
+++++++++++++++++++++++++------ gcsfs/zonal_file.py | 4 +++ 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index fe1943db..8bb29dfb 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -3,6 +3,7 @@ import os from itertools import chain from unittest import mock +from google.cloud.storage.exceptions import DataCorruption import pytest from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( @@ -155,10 +156,10 @@ def test_readline_zb(gcs_adapter, zonal_mocks): def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): data = b"a,b\n11,22\n3,4" + if not gcs_adapter.on_google: + with gcs_adapter.open(a, "wb") as f: + f.write(data) with zonal_mocks(data) as mocks: - if mocks: - with gcs_adapter.open(a, "wb") as f: - f.write(data) with gcs_adapter.open(a, "rb") as f: result = f.readline() assert result == b"a,b\n" @@ -178,10 +179,10 @@ def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): def test_readline_empty_zb(gcs_adapter, zonal_mocks): data = b"" + if not gcs_adapter.on_google: + with gcs_adapter.open(b, "wb") as f: + f.write(data) with zonal_mocks(data) as mocks: - if mocks: - with gcs_adapter.open(b, "wb") as f: - f.write(data) with gcs_adapter.open(b, "rb") as f: result = f.readline() assert result == data @@ -189,10 +190,10 @@ def test_readline_empty_zb(gcs_adapter, zonal_mocks): def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): data = b"ab\n" + b"a" * (2**18) + b"\nab" + if not gcs_adapter.on_google: + with gcs_adapter.open(c, "wb") as f: + f.write(data) with zonal_mocks(data) as mocks: - if mocks: - with gcs_adapter.open(c, "wb") as f: - f.write(data) with gcs_adapter.open(c, "rb", block_size=2**18) as f: result = f.readline() expected = b"ab\n" @@ -238,3 +239,32 @@ def test_process_limits_parametrized( ) assert offset == exp_offset assert length == exp_length + + +@pytest.mark.parametrize( + "exception_to_raise", + [ValueError, DataCorruption, Exception], +) +def test_mrd_exception_handling(gcs_adapter, zonal_mocks, exception_to_raise): + """ + Tests that _cat_file correctly propagates exceptions from mrd.download_ranges. + """ + with zonal_mocks(json_data) as mocks: + if gcs_adapter.on_google: + pytest.skip("Cannot mock exceptions on real GCS") + + # Configure the mock to raise a specified exception + if exception_to_raise is DataCorruption: + # The first argument is 'response', the message is in '*args' + mocks["downloader"].download_ranges.side_effect = exception_to_raise( + None, "Test exception raised" + ) + else: + mocks["downloader"].download_ranges.side_effect = exception_to_raise( + "Test exception raised" + ) + + with pytest.raises(exception_to_raise, match="Test exception raised"): + gcs_adapter.read_block(file_path, 0, 10) + + mocks["downloader"].download_ranges.assert_called_once() diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index be5df204..1118df8b 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -20,6 +20,10 @@ def __init__(self, *args, **kwargs): self.mrd = asyn.sync( self.gcsfs.loop, self._init_mrd, self.bucket, self.key, self.generation ) + else: + raise NotImplementedError( + "Only read operations are currently supported for Zonal buckets." 
+ ) async def _init_mrd(self, bucket_name, object_name, generation=None): """ From 064c286f977d132dce085077f71f13c28b10fd5e Mon Sep 17 00:00:00 2001 From: Sunidhi Chandra Date: Thu, 30 Oct 2025 16:12:42 +0000 Subject: [PATCH 41/59] Add ClientInfo in AsyncGrpcClient --- gcsfs/gcsfs_adapter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 43a314e9..19995a07 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -2,9 +2,11 @@ from enum import Enum from fsspec import asyn +from google.api_core.client_info import ClientInfo from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient from . import zb_hns_utils +from . import __version__ as version from .core import GCSFile, GCSFileSystem from .zonal_file import ZonalFile @@ -45,7 +47,9 @@ def __init__(self, *args, **kwargs): async def _create_grpc_client(self): if self.grpc_client is None: - return AsyncGrpcClient().grpc_client + return AsyncGrpcClient( + client_info=ClientInfo(user_agent=f"python-gcsfs/{version}") + ).grpc_client else: return self.grpc_client From 8b2a8d92c895333179e0b2db77c3d3534ce00036 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Fri, 31 Oct 2025 11:48:00 +0000 Subject: [PATCH 42/59] refactors storage layout to use sdk control client --- .gitignore | 1 - gcsfs/gcsfs_adapter.py | 39 ++++++++++++++++++++++++++++++--------- requirements.txt | 1 + 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 9cc47b5e..023f552c 100644 --- a/.gitignore +++ b/.gitignore @@ -103,4 +103,3 @@ target/ .pytest_cache/ libs/*.whl -*zonal_test.py diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 19995a07..c2963bfd 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -3,15 +3,19 @@ from fsspec import asyn from google.api_core.client_info import ClientInfo +from google.api_core import gapic_v1 from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from google.api_core import exceptions as api_exceptions from . import zb_hns_utils from . 
import __version__ as version from .core import GCSFile, GCSFileSystem from .zonal_file import ZonalFile +from google.cloud import storage_control_v2 logger = logging.getLogger("gcsfs") +USER_AGENT="python-gcsfs" class BucketType(Enum): ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" @@ -39,34 +43,51 @@ def __init__(self, *args, **kwargs): kwargs.pop("experimental_zb_hns_support", None) super().__init__(*args, **kwargs) self.grpc_client = None - # grpc client initialization is not a resource blocking operation hence - # initializing the client in early to reduce code duplication for initialization in - # multiple methods + self.storage_control_client = None + # initializing grpc and storage control client for Hierarchical and + # zonal bucket operations self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) + self._storage_control_client = asyn.sync(self.loop, self._create_control_plane_client) self._storage_layout_cache = {} async def _create_grpc_client(self): if self.grpc_client is None: return AsyncGrpcClient( - client_info=ClientInfo(user_agent=f"python-gcsfs/{version}") + client_info=ClientInfo(user_agent=f"{USER_AGENT}/{version}"), ).grpc_client else: return self.grpc_client + async def _create_control_plane_client(self): + # Initialize the storage control plane client for bucket + # metadata operations + client_info = gapic_v1.client_info.ClientInfo(user_agent=f"{USER_AGENT}/{version}") + return storage_control_v2.StorageControlAsyncClient( + credentials=self.credentials.credentials, + client_info=client_info + ) + async def _get_storage_layout(self, bucket): if bucket in self._storage_layout_cache: return self._storage_layout_cache[bucket] try: - response = await self._call( - "GET", f"b/{bucket}/storageLayout", json_out=True - ) - if response.get("locationType") == "zone": + + # Bucket name details + bucket_name_value=f"projects/_/buckets/{bucket}/storageLayout" + # Make the request to get bucket type + response = await self._storage_control_client.get_storage_layout(name=bucket_name_value) + + if response.location_type == "zone": self._storage_layout_cache[bucket] = BucketType.ZONAL_HIERARCHICAL else: # This should be updated to include HNS in the future self._storage_layout_cache[bucket] = BucketType.NON_HIERARCHICAL + except api_exceptions.NotFound: + print( + f"Error: Bucket {bucket} not found or you lack permissions.") + return None except Exception as e: - logger.error(f"Could not determine storage layout for bucket {bucket}: {e}") + logger.error(f"Could not determine bucket type for bucket name {bucket}: {e}") # Default to UNKNOWN self._storage_layout_cache[bucket] = BucketType.UNKNOWN return self._storage_layout_cache[bucket] diff --git a/requirements.txt b/requirements.txt index c4d099f1..ffb41950 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ google-cloud-storage grpcio pytest requests +google-cloud-storage-control \ No newline at end of file From b1f0117d1dab5dc95562d91c52a67773b278d877 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Fri, 31 Oct 2025 11:57:12 +0000 Subject: [PATCH 43/59] fixes lint errors --- gcsfs/gcsfs_adapter.py | 38 +++++++++++++++++++------------ gcsfs/tests/test_gcsfs_adapter.py | 13 +++++------ requirements.txt | 2 +- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index c2963bfd..6be7229b 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -2,20 +2,22 @@ from enum import Enum from fsspec import asyn -from google.api_core.client_info 
import ClientInfo -from google.api_core import gapic_v1 -from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient from google.api_core import exceptions as api_exceptions +from google.api_core import gapic_v1 +from google.api_core.client_info import ClientInfo +from google.cloud import storage_control_v2 +from google.cloud.storage._experimental.asyncio.async_grpc_client import \ + AsyncGrpcClient -from . import zb_hns_utils from . import __version__ as version +from . import zb_hns_utils from .core import GCSFile, GCSFileSystem from .zonal_file import ZonalFile -from google.cloud import storage_control_v2 logger = logging.getLogger("gcsfs") -USER_AGENT="python-gcsfs" +USER_AGENT = "python-gcsfs" + class BucketType(Enum): ZONAL_HIERARCHICAL = "ZONAL_HIERARCHICAL" @@ -47,7 +49,9 @@ def __init__(self, *args, **kwargs): # initializing grpc and storage control client for Hierarchical and # zonal bucket operations self.grpc_client = asyn.sync(self.loop, self._create_grpc_client) - self._storage_control_client = asyn.sync(self.loop, self._create_control_plane_client) + self._storage_control_client = asyn.sync( + self.loop, self._create_control_plane_client + ) self._storage_layout_cache = {} async def _create_grpc_client(self): @@ -61,10 +65,11 @@ async def _create_grpc_client(self): async def _create_control_plane_client(self): # Initialize the storage control plane client for bucket # metadata operations - client_info = gapic_v1.client_info.ClientInfo(user_agent=f"{USER_AGENT}/{version}") + client_info = gapic_v1.client_info.ClientInfo( + user_agent=f"{USER_AGENT}/{version}" + ) return storage_control_v2.StorageControlAsyncClient( - credentials=self.credentials.credentials, - client_info=client_info + credentials=self.credentials.credentials, client_info=client_info ) async def _get_storage_layout(self, bucket): @@ -73,9 +78,11 @@ async def _get_storage_layout(self, bucket): try: # Bucket name details - bucket_name_value=f"projects/_/buckets/{bucket}/storageLayout" + bucket_name_value = f"projects/_/buckets/{bucket}/storageLayout" # Make the request to get bucket type - response = await self._storage_control_client.get_storage_layout(name=bucket_name_value) + response = await self._storage_control_client.get_storage_layout( + name=bucket_name_value + ) if response.location_type == "zone": self._storage_layout_cache[bucket] = BucketType.ZONAL_HIERARCHICAL @@ -83,11 +90,12 @@ async def _get_storage_layout(self, bucket): # This should be updated to include HNS in the future self._storage_layout_cache[bucket] = BucketType.NON_HIERARCHICAL except api_exceptions.NotFound: - print( - f"Error: Bucket {bucket} not found or you lack permissions.") + print(f"Error: Bucket {bucket} not found or you lack permissions.") return None except Exception as e: - logger.error(f"Could not determine bucket type for bucket name {bucket}: {e}") + logger.error( + f"Could not determine bucket type for bucket name {bucket}: {e}" + ) # Default to UNKNOWN self._storage_layout_cache[bucket] = BucketType.UNKNOWN return self._storage_layout_cache[bucket] diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 8bb29dfb..a48b6d65 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -3,12 +3,11 @@ import os from itertools import chain from unittest import mock -from google.cloud.storage.exceptions import DataCorruption import pytest -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( - 
AsyncMultiRangeDownloader, -) +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ + AsyncMultiRangeDownloader +from google.cloud.storage.exceptions import DataCorruption from gcsfs.gcsfs_adapter import BucketType from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files @@ -158,7 +157,7 @@ def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): data = b"a,b\n11,22\n3,4" if not gcs_adapter.on_google: with gcs_adapter.open(a, "wb") as f: - f.write(data) + f.write(data) with zonal_mocks(data) as mocks: with gcs_adapter.open(a, "rb") as f: result = f.readline() @@ -181,7 +180,7 @@ def test_readline_empty_zb(gcs_adapter, zonal_mocks): data = b"" if not gcs_adapter.on_google: with gcs_adapter.open(b, "wb") as f: - f.write(data) + f.write(data) with zonal_mocks(data) as mocks: with gcs_adapter.open(b, "rb") as f: result = f.readline() @@ -192,7 +191,7 @@ def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): data = b"ab\n" + b"a" * (2**18) + b"\nab" if not gcs_adapter.on_google: with gcs_adapter.open(c, "wb") as f: - f.write(data) + f.write(data) with zonal_mocks(data) as mocks: with gcs_adapter.open(c, "rb", block_size=2**18) as f: result = f.readline() diff --git a/requirements.txt b/requirements.txt index ffb41950..f62203c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ google-auth>=1.2 google-auth-oauthlib google-cloud-iam google-cloud-storage +google-cloud-storage-control grpcio pytest requests -google-cloud-storage-control \ No newline at end of file From c47b53f946d0ad2b148f5a4dbe360ae8e91778e6 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 16:37:09 +0000 Subject: [PATCH 44/59] fixes lint errors --- gcsfs/core.py | 1 - gcsfs/tests/test_gcsfs_adapter.py | 68 +++++++++++++++---------------- gcsfs/tests/test_zb_hns_utils.py | 1 - 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index c0b8f949..4bccecdc 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -13,7 +13,6 @@ import warnings import weakref from datetime import datetime, timedelta -from enum import Enum from urllib.parse import parse_qs from urllib.parse import quote as quote_urllib from urllib.parse import urlsplit diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index a48b6d65..988d7726 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -146,11 +146,10 @@ def test_readline_zb(gcs_adapter, zonal_mocks): [files.items(), csv_files.items(), text_files.items()] ) for k, data in all_items: - with zonal_mocks(data) as mocks: - with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: - result = f.readline() - expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") - assert result == expected + with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: + result = f.readline() + expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") + assert result == expected def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): @@ -158,22 +157,21 @@ def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(a, "wb") as f: f.write(data) - with zonal_mocks(data) as mocks: - with gcs_adapter.open(a, "rb") as f: - result = f.readline() - assert result == b"a,b\n" - assert f.loc == 4 - assert f.cache.cache == data + with gcs_adapter.open(a, "rb") as f: + result = f.readline() + assert result == b"a,b\n" + assert f.loc == 4 + assert f.cache.cache == data - 
result = f.readline() - assert result == b"11,22\n" - assert f.loc == 10 - assert f.cache.cache == data + result = f.readline() + assert result == b"11,22\n" + assert f.loc == 10 + assert f.cache.cache == data - result = f.readline() - assert result == b"3,4" - assert f.loc == 13 - assert f.cache.cache == data + result = f.readline() + assert result == b"3,4" + assert f.loc == 13 + assert f.cache.cache == data def test_readline_empty_zb(gcs_adapter, zonal_mocks): @@ -181,10 +179,9 @@ def test_readline_empty_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(b, "wb") as f: f.write(data) - with zonal_mocks(data) as mocks: - with gcs_adapter.open(b, "rb") as f: - result = f.readline() - assert result == data + with gcs_adapter.open(b, "rb") as f: + result = f.readline() + assert result == data def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): @@ -192,19 +189,18 @@ def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(c, "wb") as f: f.write(data) - with zonal_mocks(data) as mocks: - with gcs_adapter.open(c, "rb", block_size=2**18) as f: - result = f.readline() - expected = b"ab\n" - assert result == expected - - result = f.readline() - expected = b"a" * (2**18) + b"\n" - assert result == expected - - result = f.readline() - expected = b"ab" - assert result == expected + with gcs_adapter.open(c, "rb", block_size=2**18) as f: + result = f.readline() + expected = b"ab\n" + assert result == expected + + result = f.readline() + expected = b"a" * (2**18) + b"\n" + assert result == expected + + result = f.readline() + expected = b"ab" + assert result == expected @pytest.mark.parametrize( diff --git a/gcsfs/tests/test_zb_hns_utils.py b/gcsfs/tests/test_zb_hns_utils.py index 51e34ce8..78976230 100644 --- a/gcsfs/tests/test_zb_hns_utils.py +++ b/gcsfs/tests/test_zb_hns_utils.py @@ -1,4 +1,3 @@ -from io import BytesIO from unittest import mock import pytest From e37d0fb6e47e76670506d2b331b39a134e3bc99d Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 17:50:12 +0000 Subject: [PATCH 45/59] fixes lint errors --- environment_gcsfs.yaml | 2 ++ gcsfs/tests/test_fuse.py | 2 +- requirements.txt | 4 ---- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/environment_gcsfs.yaml b/environment_gcsfs.yaml index 4cd6ff8b..95ad8436 100644 --- a/environment_gcsfs.yaml +++ b/environment_gcsfs.yaml @@ -13,6 +13,8 @@ dependencies: - google-auth-oauthlib - google-cloud-core - google-cloud-storage + - google-cloud-storage-control + - grpcio - pytest - pytest-timeout - pytest-asyncio diff --git a/gcsfs/tests/test_fuse.py b/gcsfs/tests/test_fuse.py index f1f494cd..3c3b0c3b 100644 --- a/gcsfs/tests/test_fuse.py +++ b/gcsfs/tests/test_fuse.py @@ -40,7 +40,7 @@ def test_fuse(gcs, fsspec_fuse_run): timeout = 20 n = 40 for i in range(n): - logging.debug(f"Attempt # {i+1}/{n} to create lock file.") + logging.debug(f"Attempt # {i + 1} / {n} to create lock file.") try: open(os.path.join(mountpath, "lock"), "w").close() os.remove(os.path.join(mountpath, "lock")) diff --git a/requirements.txt b/requirements.txt index d69725fb..f82b0722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,5 @@ decorator>4.1.2 fsspec==2025.10.0 google-auth>=1.2 google-auth-oauthlib -google-cloud-iam google-cloud-storage -google-cloud-storage-control -grpcio -pytest requests From 11af0000c196a3e856308afc0629cc8a5317067d Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 18:06:21 +0000 Subject: [PATCH 46/59] 
fixes conda install error --- environment_gcsfs.yaml | 1 - requirements.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/environment_gcsfs.yaml b/environment_gcsfs.yaml index 95ad8436..c5a0efba 100644 --- a/environment_gcsfs.yaml +++ b/environment_gcsfs.yaml @@ -13,7 +13,6 @@ dependencies: - google-auth-oauthlib - google-cloud-core - google-cloud-storage - - google-cloud-storage-control - grpcio - pytest - pytest-timeout diff --git a/requirements.txt b/requirements.txt index f82b0722..28e2cafa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ fsspec==2025.10.0 google-auth>=1.2 google-auth-oauthlib google-cloud-storage +google-cloud-storage-control requests From c4cf777ffa0837c894e7f35cb6d06ca8df5925d4 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 18:47:55 +0000 Subject: [PATCH 47/59] mocks fake test credentials for grpc client --- gcsfs/tests/conftest.py | 12 +++++++----- gcsfs/tests/test_core.py | 10 ++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 65893dbf..25eda414 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -2,6 +2,7 @@ import shlex import subprocess import time +from unittest.mock import patch import fsspec import pytest @@ -126,11 +127,12 @@ def gcs(gcs_factory, populate=True): @pytest.fixture def gcs_adapter(gcs_factory, populate=True): - gcs_adapter = gcs_factory(experimental_zb_hns_support=True) - # Check if we are running against a real GCS endpoint - is_real_gcs = ( - os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" - ) + with patch("google.auth.default", return_value=(None, "fake-project")): + gcs_adapter = gcs_factory(experimental_zb_hns_support=True) + # Check if we are running against a real GCS endpoint + is_real_gcs = ( + os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + ) try: # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint if not is_real_gcs: diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 6ba2c5c4..e3ae2ce1 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -4,6 +4,7 @@ from datetime import datetime, timezone from itertools import chain from unittest import mock +from unittest.mock import patch from urllib.parse import parse_qs, unquote, urlparse from uuid import uuid4 @@ -1746,8 +1747,9 @@ def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): - gcs = gcs_factory(experimental_zb_hns_support=True) + with patch("google.auth.default", return_value=(None, "fake-project")): + gcs = gcs_factory(experimental_zb_hns_support=True) - assert isinstance( - gcs, GCSFileSystemAdapter - ), "Expected File system instance to be GCSFileSystemAdapter" + assert isinstance( + gcs, GCSFileSystemAdapter + ), "Expected File system instance to be GCSFileSystemAdapter" From 5714012e0c1825ea8133b66cbc41591ace2a6b16 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 19:07:23 +0000 Subject: [PATCH 48/59] fixes conflicting lint rules black and isort --- .isort.cfg | 1 + gcsfs/gcsfs_adapter.py | 3 +-- gcsfs/tests/test_gcsfs_adapter.py | 5 +++-- gcsfs/zb_hns_utils.py | 5 +++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index 1fff95db..1eab763a 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,2 +1,3 @@ [settings] +profile = black 
known_third_party = aiohttp,click,decorator,fsspec,fuse,google,google_auth_oauthlib,pytest,requests,setuptools diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 6be7229b..59230c25 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -6,8 +6,7 @@ from google.api_core import gapic_v1 from google.api_core.client_info import ClientInfo from google.cloud import storage_control_v2 -from google.cloud.storage._experimental.asyncio.async_grpc_client import \ - AsyncGrpcClient +from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient from . import __version__ as version from . import zb_hns_utils diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 988d7726..b6128789 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -5,8 +5,9 @@ from unittest import mock import pytest -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ - AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( + AsyncMultiRangeDownloader, +) from google.cloud.storage.exceptions import DataCorruption from gcsfs.gcsfs_adapter import BucketType diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py index aa8ff3e0..3ddf2c37 100644 --- a/gcsfs/zb_hns_utils.py +++ b/gcsfs/zb_hns_utils.py @@ -1,7 +1,8 @@ from io import BytesIO -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import \ - AsyncMultiRangeDownloader +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( + AsyncMultiRangeDownloader, +) async def create_mrd(grpc_client, bucket_name, object_name, generation=None): From fccd43a55326dda67e246bab09bb44b7549bba25 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Mon, 3 Nov 2025 19:07:59 +0000 Subject: [PATCH 49/59] adds missing pytest package in conda install --- environment_gcsfs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment_gcsfs.yaml b/environment_gcsfs.yaml index c5a0efba..39db5120 100644 --- a/environment_gcsfs.yaml +++ b/environment_gcsfs.yaml @@ -17,6 +17,7 @@ dependencies: - pytest - pytest-timeout - pytest-asyncio + - pytest-subtests - requests - ujson - pip: From 11dd7221090eb39889e9231ffc023e09034f92fe Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Tue, 4 Nov 2025 13:10:33 +0000 Subject: [PATCH 50/59] refactor get bucket type --- gcsfs/gcsfs_adapter.py | 21 +++++++++++---------- gcsfs/tests/test_gcsfs_adapter.py | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index 59230c25..ee0e8dc3 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -71,7 +71,7 @@ async def _create_control_plane_client(self): credentials=self.credentials.credentials, client_info=client_info ) - async def _get_storage_layout(self, bucket): + async def _get_bucket_type(self, bucket): if bucket in self._storage_layout_cache: return self._storage_layout_cache[bucket] try: @@ -84,22 +84,21 @@ async def _get_storage_layout(self, bucket): ) if response.location_type == "zone": - self._storage_layout_cache[bucket] = BucketType.ZONAL_HIERARCHICAL + return BucketType.ZONAL_HIERARCHICAL else: # This should be updated to include HNS in the future - self._storage_layout_cache[bucket] = BucketType.NON_HIERARCHICAL + return BucketType.NON_HIERARCHICAL except api_exceptions.NotFound: print(f"Error: Bucket {bucket} not found or you lack permissions.") - return None + 
return BucketType.UNKNOWN
         except Exception as e:
             logger.error(
                 f"Could not determine bucket type for bucket name {bucket}: {e}"
             )
             # Default to UNKNOWN
-            self._storage_layout_cache[bucket] = BucketType.UNKNOWN
-        return self._storage_layout_cache[bucket]
+            return BucketType.UNKNOWN
 
-    _sync_get_storage_layout = asyn.sync_wrapper(_get_storage_layout)
+    _sync_get_bucket_type = asyn.sync_wrapper(_get_bucket_type)
 
     def _open(
         self,
@@ -119,7 +118,8 @@ def _open(
         Open a file.
         """
         bucket, _, _ = self.split_path(path)
-        bucket_type = self._sync_get_storage_layout(bucket)
+        bucket_type = self._sync_get_bucket_type(bucket)
+        self._storage_layout_cache[bucket] = bucket_type
         return gcs_file_types[bucket_type](
             self,
             path,
@@ -191,8 +191,9 @@ async def _process_limits_to_offset_and_length(self, path, start, end):
         )
 
     async def _is_zonal_bucket(self, bucket):
-        layout = await self._get_storage_layout(bucket)
-        return layout == BucketType.ZONAL_HIERARCHICAL
+        bucket_type = await self._get_bucket_type(bucket)
+        self._storage_layout_cache[bucket] = bucket_type
+        return bucket_type == BucketType.ZONAL_HIERARCHICAL
 
     async def _cat_file(self, path, start=None, end=None, **kwargs):
         """
diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py
index b6128789..3b3ccbeb 100644
--- a/gcsfs/tests/test_gcsfs_adapter.py
+++ b/gcsfs/tests/test_gcsfs_adapter.py
@@ -35,10 +35,10 @@ def _zonal_mocks_factory(file_data):
             yield None
             return
     patch_target_get_layout = (
-        "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_storage_layout"
+        "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_bucket_type"
     )
     patch_target_sync_layout = (
-        "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_storage_layout"
+        "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_bucket_type"
     )
     patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd"
     patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file"

From a2b507702a6b95dd33f6d500ac00adde334026ca Mon Sep 17 00:00:00 2001
From: suni72 <59387263+suni72@users.noreply.github.com>
Date: Tue, 4 Nov 2025 21:10:27 +0530
Subject: [PATCH 51/59] Implement Zonal Read Stream Cleanup (#7)

* Add mrd closing logic in ZonalFile and GCSFSAdapter
Add exception handling in GCSFile fetch_range

* Fix gcs_adapter tests to use zonal mocks
add test to validate mrd stream is closed with GCSFile closing

* Fix: use patch for auth in gcs_adapter fixture for fake gcs only
---
 gcsfs/gcsfs_adapter.py            | 11 +++-
 gcsfs/tests/conftest.py           | 62 +++++++++++++----------
 gcsfs/tests/test_gcsfs_adapter.py | 83 ++++++++++++++++++++-----------
 gcsfs/zonal_file.py               | 15 +++++-
 4 files changed, 112 insertions(+), 59 deletions(-)

diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py
index ee0e8dc3..e8846fd5 100644
--- a/gcsfs/gcsfs_adapter.py
+++ b/gcsfs/gcsfs_adapter.py
@@ -200,6 +200,7 @@ async def _cat_file(self, path, start=None, end=None, **kwargs):
         Fetch a file's contents as bytes.
         """
         mrd = kwargs.pop("mrd", None)
+        mrd_created = False
 
         # A new MRD is required when read is done directly by the
         # GCSFilesystem class without creating a GCSFile object first.
@@ -212,8 +213,16 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): mrd = await zb_hns_utils.create_mrd( self.grpc_client, bucket, object_name, generation ) + mrd_created = True offset, length = await self._process_limits_to_offset_and_length( path, start, end ) - return await zb_hns_utils.download_range(offset=offset, length=length, mrd=mrd) + try: + return await zb_hns_utils.download_range( + offset=offset, length=length, mrd=mrd + ) + finally: + # Explicit cleanup if we created the MRD and it has a close method + if mrd_created: + await mrd.close() diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 25eda414..83fd396a 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -2,6 +2,7 @@ import shlex import subprocess import time +from contextlib import nullcontext from unittest.mock import patch import fsspec @@ -127,37 +128,44 @@ def gcs(gcs_factory, populate=True): @pytest.fixture def gcs_adapter(gcs_factory, populate=True): - with patch("google.auth.default", return_value=(None, "fake-project")): + # Check if we are running against a real GCS endpoint + is_real_gcs = ( + os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" + ) + + patch_manager = ( + patch("google.auth.default", return_value=(None, "fake-project")) + if not is_real_gcs + else nullcontext() + ) + + with patch_manager: gcs_adapter = gcs_factory(experimental_zb_hns_support=True) - # Check if we are running against a real GCS endpoint - is_real_gcs = ( - os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" - ) - try: - # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint - if not is_real_gcs: - try: - gcs_adapter.rm(TEST_BUCKET, recursive=True) - except FileNotFoundError: - pass + try: + # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint + if not is_real_gcs: + try: + gcs_adapter.rm(TEST_BUCKET, recursive=True) + except FileNotFoundError: + pass + try: + gcs_adapter.mkdir(TEST_BUCKET) + except Exception: + pass + if populate: + gcs_adapter.pipe( + {TEST_BUCKET + "/" + k: v for k, v in allfiles.items()} + ) + gcs_adapter.invalidate_cache() + yield gcs_adapter + finally: try: - gcs_adapter.mkdir(TEST_BUCKET) + # Only remove the bucket/contents if we are NOT using the real GCS + if not is_real_gcs: + gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) + gcs_adapter.rm(TEST_BUCKET) except Exception: pass - if populate: - gcs_adapter.pipe( - {TEST_BUCKET + "/" + k: v for k, v in allfiles.items()} - ) - gcs_adapter.invalidate_cache() - yield gcs_adapter - finally: - try: - # Only remove the bucket/contents if we are NOT using the real GCS - if not is_real_gcs: - gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) - gcs_adapter.rm(TEST_BUCKET) - except Exception: - pass @pytest.fixture diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_gcsfs_adapter.py index 3b3ccbeb..eca28d62 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_gcsfs_adapter.py @@ -147,10 +147,11 @@ def test_readline_zb(gcs_adapter, zonal_mocks): [files.items(), csv_files.items(), text_files.items()] ) for k, data in all_items: - with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: - result = f.readline() - expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") - assert result == expected + with zonal_mocks(data): + with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: + result = f.readline() + expected = 
data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") + assert result == expected def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): @@ -158,21 +159,22 @@ def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(a, "wb") as f: f.write(data) - with gcs_adapter.open(a, "rb") as f: - result = f.readline() - assert result == b"a,b\n" - assert f.loc == 4 - assert f.cache.cache == data + with zonal_mocks(data): + with gcs_adapter.open(a, "rb") as f: + result = f.readline() + assert result == b"a,b\n" + assert f.loc == 4 + assert f.cache.cache == data - result = f.readline() - assert result == b"11,22\n" - assert f.loc == 10 - assert f.cache.cache == data + result = f.readline() + assert result == b"11,22\n" + assert f.loc == 10 + assert f.cache.cache == data - result = f.readline() - assert result == b"3,4" - assert f.loc == 13 - assert f.cache.cache == data + result = f.readline() + assert result == b"3,4" + assert f.loc == 13 + assert f.cache.cache == data def test_readline_empty_zb(gcs_adapter, zonal_mocks): @@ -180,9 +182,10 @@ def test_readline_empty_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(b, "wb") as f: f.write(data) - with gcs_adapter.open(b, "rb") as f: - result = f.readline() - assert result == data + with zonal_mocks(data): + with gcs_adapter.open(b, "rb") as f: + result = f.readline() + assert result == data def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): @@ -190,18 +193,19 @@ def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): if not gcs_adapter.on_google: with gcs_adapter.open(c, "wb") as f: f.write(data) - with gcs_adapter.open(c, "rb", block_size=2**18) as f: - result = f.readline() - expected = b"ab\n" - assert result == expected + with zonal_mocks(data): + with gcs_adapter.open(c, "rb", block_size=2**18) as f: + result = f.readline() + expected = b"ab\n" + assert result == expected - result = f.readline() - expected = b"a" * (2**18) + b"\n" - assert result == expected + result = f.readline() + expected = b"a" * (2**18) + b"\n" + assert result == expected - result = f.readline() - expected = b"ab" - assert result == expected + result = f.readline() + expected = b"ab" + assert result == expected @pytest.mark.parametrize( @@ -264,3 +268,22 @@ def test_mrd_exception_handling(gcs_adapter, zonal_mocks, exception_to_raise): gcs_adapter.read_block(file_path, 0, 10) mocks["downloader"].download_ranges.assert_called_once() + + +def test_mrd_stream_cleanup(gcs_adapter, zonal_mocks): + """ + Tests that mrd stream is properly closed with file closure. + """ + with zonal_mocks(json_data) as mocks: + if not gcs_adapter.on_google: + + def close_side_effect(): + mocks["downloader"].is_stream_open = False + + mocks["downloader"].close.side_effect = close_side_effect + + with gcs_adapter.open(file_path, "rb") as f: + assert f.mrd is not None + + assert True is f.closed + assert False is f.mrd.is_stream_open diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index 1118df8b..ed82c22d 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -38,4 +38,17 @@ def _fetch_range(self, start, end): Overrides the default _fetch_range to implement the gRPC read path. 
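
        On this path a read past end-of-file surfaces as a RuntimeError
        ("not satisfiable") rather than as an empty body, so the override
        maps that case to b"". A rough sketch of the intended behaviour
        (path and offsets hypothetical):

            with fs.open("zonal-bucket/blob", "rb") as f:  # f is a ZonalFile
                f._fetch_range(0, 8)                # first 8 bytes via the MRD
                f._fetch_range(10**12, 10**12 + 8)  # beyond EOF -> b""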
""" - return self.gcsfs.cat_file(self.path, start=start, end=end, mrd=self.mrd) + try: + return self.gcsfs.cat_file(self.path, start=start, end=end, mrd=self.mrd) + except RuntimeError as e: + if "not satisfiable" in str(e): + return b"" + raise + + def close(self): + """ + Closes the ZonalFile and the underlying AsyncMultiRangeDownloader. + """ + if self.mrd: + asyn.sync(self.gcsfs.loop, self.mrd.close) + super().close() From 0c3df8efee9fb6575dd26724bce5db6436802c6c Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Sun, 9 Nov 2025 09:35:08 +0000 Subject: [PATCH 52/59] adds GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT as env variable instead of kwargs --- gcsfs/core.py | 8 ++++---- gcsfs/gcsfs_adapter.py | 1 - gcsfs/tests/conftest.py | 4 +++- gcsfs/tests/test_core.py | 16 ++++++++++------ 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 4bccecdc..93df68e3 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -287,9 +287,11 @@ def __new__(cls, *args, **kwargs): Factory to return a GCSFileSystemAdapter instance if the experimental flag is enabled. """ - experimental_support = kwargs.pop("experimental_zb_hns_support", False) + experimental_multi_bucket_support = os.environ.get( + "GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false" + ).lower() in ("true", "1") - if experimental_support: + if experimental_multi_bucket_support: from .gcsfs_adapter import GCSFileSystemAdapter return object.__new__(GCSFileSystemAdapter) @@ -315,7 +317,6 @@ def __init__( endpoint_url=None, default_location=None, version_aware=False, - experimental_zb_hns_support=False, **kwargs, ): if cache_timeout is not None: @@ -342,7 +343,6 @@ def __init__( self.session_kwargs = session_kwargs or {} self.default_location = default_location self.version_aware = version_aware - self.experimental_zb_hns_support = experimental_zb_hns_support if check_connection: warnings.warn( diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/gcsfs_adapter.py index e8846fd5..66ce2b0c 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/gcsfs_adapter.py @@ -41,7 +41,6 @@ class GCSFileSystemAdapter(GCSFileSystem): """ def __init__(self, *args, **kwargs): - kwargs.pop("experimental_zb_hns_support", None) super().__init__(*args, **kwargs) self.grpc_client = None self.storage_control_client = None diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 83fd396a..5dd8fe13 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -140,7 +140,8 @@ def gcs_adapter(gcs_factory, populate=True): ) with patch_manager: - gcs_adapter = gcs_factory(experimental_zb_hns_support=True) + os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" + gcs_adapter = gcs_factory() try: # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint if not is_real_gcs: @@ -159,6 +160,7 @@ def gcs_adapter(gcs_factory, populate=True): gcs_adapter.invalidate_cache() yield gcs_adapter finally: + del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] try: # Only remove the bucket/contents if we are NOT using the real GCS if not is_real_gcs: diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index e3ae2ce1..18c6e314 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1735,7 +1735,7 @@ def test_get_error(gcs): gcs.get_file(f"{TEST_BUCKET}/doesnotexist", "other") -def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory): +def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_present(gcs_factory): gcs = gcs_factory() assert isinstance( @@ -1747,9 +1747,13 @@ 
def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_passed(gcs_factory
 
 def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory):
-    with patch("google.auth.default", return_value=(None, "fake-project")):
-        gcs = gcs_factory(experimental_zb_hns_support=True)
+    try:
+        with patch("google.auth.default", return_value=(None, "fake-project")):
+            os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true"
+            gcs = gcs_factory()
 
-    assert isinstance(
-        gcs, GCSFileSystemAdapter
-    ), "Expected File system instance to be GCSFileSystemAdapter"
+        assert isinstance(
+            gcs, GCSFileSystemAdapter
+        ), "Expected File system instance to be GCSFileSystemAdapter"
+    finally:
+        del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"]

From 0e9c2ab65172a155fd8853c13523d6ee7a25a4bd Mon Sep 17 00:00:00 2001
From: Ankita Luthra
Date: Sun, 9 Nov 2025 12:20:03 +0000
Subject: [PATCH 53/59] removes timeout mark from pytest fixture, since the
 mark has no effect on fixtures and pytest treats it as an error on ci

---
 gcsfs/tests/test_fuse.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gcsfs/tests/test_fuse.py b/gcsfs/tests/test_fuse.py
index 3c3b0c3b..97b2ee10 100644
--- a/gcsfs/tests/test_fuse.py
+++ b/gcsfs/tests/test_fuse.py
@@ -11,7 +11,6 @@
 from gcsfs.tests.settings import TEST_BUCKET
 
 
-@pytest.mark.timeout(180)
 @pytest.fixture
 def fsspec_fuse_run():
     """Fixture catches other errors on fuse import."""

From c3ad9b24d5719513a4ae831e366b33b4936916f5 Mon Sep 17 00:00:00 2001
From: suni72 <59387263+suni72@users.noreply.github.com>
Date: Wed, 12 Nov 2025 10:29:02 +0530
Subject: [PATCH 54/59] Refactor: Rename GcsFileSystemAdapter to
 ExtendedGcsFileSystem & Fix Test Mocking (#8)

* Add mrd closing logic in ZonalFile and GCSFSAdapter
Add exception handling in GCSFile fetch_range

* Fix gcs_adapter tests to use zonal mocks
add test to validate mrd stream is closed with GCSFile closing

* Fix: use patch for auth in gcs_adapter fixture for fake gcs only

* Change imports to absolute imports

* Rename GcsFileSystemAdapter to ExtendedGcsFileSystem

* Mock _sync_get_bucket_type call to return UNKNOWN while opening a file in write mode

* Updated ExtendedGcsFileSystem class description
Separated cleanup logic from extended_gcsfs fixture for better readability
---
 gcsfs/core.py                                 |  6 +-
 gcsfs/{gcsfs_adapter.py => extended_gcsfs.py} | 17 ++--
 gcsfs/tests/conftest.py                       | 42 ++++++----
 gcsfs/tests/test_core.py                      | 10 +--
 ...csfs_adapter.py => test_extended_gcsfs.py} | 81 +++++++++++--------
 gcsfs/zonal_file.py                           |  4 +-
 6 files changed, 90 insertions(+), 70 deletions(-)
 rename gcsfs/{gcsfs_adapter.py => extended_gcsfs.py} (92%)
 rename gcsfs/tests/{test_gcsfs_adapter.py => test_extended_gcsfs.py} (77%)

diff --git a/gcsfs/core.py b/gcsfs/core.py
index 93df68e3..ee61f5ce 100644
--- a/gcsfs/core.py
+++ b/gcsfs/core.py
@@ -284,7 +284,7 @@ class GCSFileSystem(asyn.AsyncFileSystem):
     def __new__(cls, *args, **kwargs):
         """
-        Factory to return a GCSFileSystemAdapter instance if the experimental
+        Factory to return an ExtendedGcsFileSystem instance if the experimental
         flag is enabled.
""" experimental_multi_bucket_support = os.environ.get( @@ -292,9 +292,9 @@ def __new__(cls, *args, **kwargs): ).lower() in ("true", "1") if experimental_multi_bucket_support: - from .gcsfs_adapter import GCSFileSystemAdapter + from .extended_gcsfs import ExtendedGcsFileSystem - return object.__new__(GCSFileSystemAdapter) + return object.__new__(ExtendedGcsFileSystem) else: return object.__new__(cls) diff --git a/gcsfs/gcsfs_adapter.py b/gcsfs/extended_gcsfs.py similarity index 92% rename from gcsfs/gcsfs_adapter.py rename to gcsfs/extended_gcsfs.py index 66ce2b0c..54f9229b 100644 --- a/gcsfs/gcsfs_adapter.py +++ b/gcsfs/extended_gcsfs.py @@ -8,10 +8,10 @@ from google.cloud import storage_control_v2 from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient -from . import __version__ as version -from . import zb_hns_utils -from .core import GCSFile, GCSFileSystem -from .zonal_file import ZonalFile +from gcsfs import __version__ as version +from gcsfs import zb_hns_utils +from gcsfs.core import GCSFile, GCSFileSystem +from gcsfs.zonal_file import ZonalFile logger = logging.getLogger("gcsfs") @@ -33,11 +33,12 @@ class BucketType(Enum): } -class GCSFileSystemAdapter(GCSFileSystem): +class ExtendedGcsFileSystem(GCSFileSystem): """ - This class will be used when experimental_zb_hns_support is set to true for all bucket types. - GCSFileSystemAdapter is a subclass of GCSFileSystem that adds specialized - logic to support Zonal and Hierarchical buckets. + This class will be used when GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT env variable is set to true. + ExtendedGcsFileSystem is a subclass of GCSFileSystem that adds new logic for bucket types + including zonal and hierarchical. For buckets without special properties, it forwards requests + to the parent class GCSFileSystem for default processing. 
""" def __init__(self, *args, **kwargs): diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index 5dd8fe13..cf3b659d 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -1,3 +1,4 @@ +import logging import os import shlex import subprocess @@ -126,48 +127,55 @@ def gcs(gcs_factory, populate=True): pass +def _cleanup_gcs(gcs, is_real_gcs): + """Only remove the bucket/contents if we are NOT using the real GCS, logging a warning on failure.""" + del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] + if is_real_gcs: + return + try: + gcs.rm(gcs.find(TEST_BUCKET), recursive=True) + gcs.rm(TEST_BUCKET) + except Exception as e: + logging.warning(f"Failed to clean up GCS bucket {TEST_BUCKET}: {e}") + + @pytest.fixture -def gcs_adapter(gcs_factory, populate=True): +def extended_gcsfs(gcs_factory, populate=True): # Check if we are running against a real GCS endpoint is_real_gcs = ( os.environ.get("STORAGE_EMULATOR_HOST") == "https://storage.googleapis.com" ) - patch_manager = ( + # Mock authentication if not using a real GCS endpoint, + # since grpc client in extended_gcsfs does not work with anon access + mock_authentication_manager = ( patch("google.auth.default", return_value=(None, "fake-project")) if not is_real_gcs else nullcontext() ) - with patch_manager: + with mock_authentication_manager: os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" - gcs_adapter = gcs_factory() + extended_gcsfs = gcs_factory() try: # Only create/delete/populate the bucket if we are NOT using the real GCS endpoint if not is_real_gcs: try: - gcs_adapter.rm(TEST_BUCKET, recursive=True) + extended_gcsfs.rm(TEST_BUCKET, recursive=True) except FileNotFoundError: pass try: - gcs_adapter.mkdir(TEST_BUCKET) + extended_gcsfs.mkdir(TEST_BUCKET) except Exception: pass if populate: - gcs_adapter.pipe( + extended_gcsfs.pipe( {TEST_BUCKET + "/" + k: v for k, v in allfiles.items()} ) - gcs_adapter.invalidate_cache() - yield gcs_adapter + extended_gcsfs.invalidate_cache() + yield extended_gcsfs finally: - del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] - try: - # Only remove the bucket/contents if we are NOT using the real GCS - if not is_real_gcs: - gcs_adapter.rm(gcs_adapter.find(TEST_BUCKET), recursive=True) - gcs_adapter.rm(TEST_BUCKET) - except Exception: - pass + _cleanup_gcs(extended_gcsfs, is_real_gcs) @pytest.fixture diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 18c6e314..66944247 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -19,7 +19,7 @@ from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials -from gcsfs.gcsfs_adapter import GCSFileSystemAdapter +from gcsfs.extended_gcsfs import ExtendedGcsFileSystem from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files from gcsfs.tests.utils import tempdir, tmpfile @@ -1742,18 +1742,18 @@ def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_present(gcs_factor gcs, gcsfs.GCSFileSystem ), "Expected File system instance to be GCSFileSystem" assert not isinstance( - gcs, GCSFileSystemAdapter + gcs, ExtendedGcsFileSystem ), "Expected File system instance to be GCSFileSystem" -def test_gcs_filesystem_adapter_when_experimental_zonal_toggle_is_true(gcs_factory): +def test_extended_gcs_filesystem_when_experimental_zonal_toggle_is_true(gcs_factory): try: with patch("google.auth.default", return_value=(None, "fake-project")): os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" gcs = gcs_factory() 
assert isinstance( - gcs, GCSFileSystemAdapter - ), "Expected File system instance to be GCSFileSystemAdapter" + gcs, ExtendedGcsFileSystem + ), "Expected File system instance to be ExtendedGcsFileSystem" finally: del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] diff --git a/gcsfs/tests/test_gcsfs_adapter.py b/gcsfs/tests/test_extended_gcsfs.py similarity index 77% rename from gcsfs/tests/test_gcsfs_adapter.py rename to gcsfs/tests/test_extended_gcsfs.py index eca28d62..a7737bb6 100644 --- a/gcsfs/tests/test_gcsfs_adapter.py +++ b/gcsfs/tests/test_extended_gcsfs.py @@ -10,7 +10,7 @@ ) from google.cloud.storage.exceptions import DataCorruption -from gcsfs.gcsfs_adapter import BucketType +from gcsfs.extended_gcsfs import BucketType from gcsfs.tests.conftest import a, b, c, csv_files, files, text_files from gcsfs.tests.settings import TEST_BUCKET @@ -35,12 +35,12 @@ def _zonal_mocks_factory(file_data): yield None return patch_target_get_layout = ( - "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._get_bucket_type" + "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._get_bucket_type" ) patch_target_sync_layout = ( - "gcsfs.gcsfs_adapter.GCSFileSystemAdapter._sync_get_bucket_type" + "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._sync_get_bucket_type" ) - patch_target_create_mrd = "gcsfs.gcsfs_adapter.zb_hns_utils.create_mrd" + patch_target_create_mrd = "gcsfs.extended_gcsfs.zb_hns_utils.create_mrd" patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" async def download_side_effect(read_requests, **kwargs): @@ -100,14 +100,14 @@ async def download_side_effect(read_requests, **kwargs): ] -def test_read_block_zb(gcs_adapter, zonal_mocks, subtests): +def test_read_block_zb(extended_gcsfs, zonal_mocks, subtests): for param in read_block_params: with subtests.test(id=param.id): offset, length, delimiter, expected_data = param.values path = file_path with zonal_mocks(json_data) as mocks: - result = gcs_adapter.read_block(path, offset, length, delimiter) + result = extended_gcsfs.read_block(path, offset, length, delimiter) assert result == expected_data if mocks: @@ -120,13 +120,13 @@ def test_read_block_zb(gcs_adapter, zonal_mocks, subtests): mocks["downloader"].download_ranges.assert_not_called() -def test_read_small_zb(gcs_adapter, zonal_mocks): +def test_read_small_zb(extended_gcsfs, zonal_mocks): csv_file = "2014-01-01.csv" csv_file_path = f"{TEST_BUCKET}/{csv_file}" csv_data = csv_files[csv_file] with zonal_mocks(csv_data) as mocks: - with gcs_adapter.open(csv_file_path, "rb", block_size=10) as f: + with extended_gcsfs.open(csv_file_path, "rb", block_size=10) as f: out = [] i = 1 while True: @@ -135,32 +135,35 @@ def test_read_small_zb(gcs_adapter, zonal_mocks): if data == b"": break out.append(data) - assert gcs_adapter.cat(csv_file_path) == b"".join(out) + assert extended_gcsfs.cat(csv_file_path) == b"".join(out) # cache drop assert len(f.cache.cache) < len(out) if mocks: mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) -def test_readline_zb(gcs_adapter, zonal_mocks): +def test_readline_zb(extended_gcsfs, zonal_mocks): all_items = chain.from_iterable( [files.items(), csv_files.items(), text_files.items()] ) for k, data in all_items: with zonal_mocks(data): - with gcs_adapter.open("/".join([TEST_BUCKET, k]), "rb") as f: + with extended_gcsfs.open("/".join([TEST_BUCKET, k]), "rb") as f: result = f.readline() expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") assert result == expected -def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): +def 
test_readline_from_cache_zb(extended_gcsfs, zonal_mocks): data = b"a,b\n11,22\n3,4" - if not gcs_adapter.on_google: - with gcs_adapter.open(a, "wb") as f: - f.write(data) + if not extended_gcsfs.on_google: + with mock.patch.object( + extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + ): + with extended_gcsfs.open(a, "wb") as f: + f.write(data) with zonal_mocks(data): - with gcs_adapter.open(a, "rb") as f: + with extended_gcsfs.open(a, "rb") as f: result = f.readline() assert result == b"a,b\n" assert f.loc == 4 @@ -177,24 +180,30 @@ def test_readline_from_cache_zb(gcs_adapter, zonal_mocks): assert f.cache.cache == data -def test_readline_empty_zb(gcs_adapter, zonal_mocks): +def test_readline_empty_zb(extended_gcsfs, zonal_mocks): data = b"" - if not gcs_adapter.on_google: - with gcs_adapter.open(b, "wb") as f: - f.write(data) + if not extended_gcsfs.on_google: + with mock.patch.object( + extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + ): + with extended_gcsfs.open(b, "wb") as f: + f.write(data) with zonal_mocks(data): - with gcs_adapter.open(b, "rb") as f: + with extended_gcsfs.open(b, "rb") as f: result = f.readline() assert result == data -def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): +def test_readline_blocksize_zb(extended_gcsfs, zonal_mocks): data = b"ab\n" + b"a" * (2**18) + b"\nab" - if not gcs_adapter.on_google: - with gcs_adapter.open(c, "wb") as f: - f.write(data) + if not extended_gcsfs.on_google: + with mock.patch.object( + extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + ): + with extended_gcsfs.open(c, "wb") as f: + f.write(data) with zonal_mocks(data): - with gcs_adapter.open(c, "rb", block_size=2**18) as f: + with extended_gcsfs.open(c, "rb", block_size=2**18) as f: result = f.readline() expected = b"ab\n" assert result == expected @@ -228,13 +237,15 @@ def test_readline_blocksize_zb(gcs_adapter, zonal_mocks): ], ) def test_process_limits_parametrized( - gcs_adapter, start, end, exp_offset, exp_length, exp_exc + extended_gcsfs, start, end, exp_offset, exp_length, exp_exc ): if exp_exc is not None: with pytest.raises(exp_exc): - gcs_adapter.sync_process_limits_to_offset_and_length(file_path, start, end) + extended_gcsfs.sync_process_limits_to_offset_and_length( + file_path, start, end + ) else: - offset, length = gcs_adapter.sync_process_limits_to_offset_and_length( + offset, length = extended_gcsfs.sync_process_limits_to_offset_and_length( file_path, start, end ) assert offset == exp_offset @@ -245,12 +256,12 @@ def test_process_limits_parametrized( "exception_to_raise", [ValueError, DataCorruption, Exception], ) -def test_mrd_exception_handling(gcs_adapter, zonal_mocks, exception_to_raise): +def test_mrd_exception_handling(extended_gcsfs, zonal_mocks, exception_to_raise): """ Tests that _cat_file correctly propagates exceptions from mrd.download_ranges. 
""" with zonal_mocks(json_data) as mocks: - if gcs_adapter.on_google: + if extended_gcsfs.on_google: pytest.skip("Cannot mock exceptions on real GCS") # Configure the mock to raise a specified exception @@ -265,24 +276,24 @@ def test_mrd_exception_handling(gcs_adapter, zonal_mocks, exception_to_raise): ) with pytest.raises(exception_to_raise, match="Test exception raised"): - gcs_adapter.read_block(file_path, 0, 10) + extended_gcsfs.read_block(file_path, 0, 10) mocks["downloader"].download_ranges.assert_called_once() -def test_mrd_stream_cleanup(gcs_adapter, zonal_mocks): +def test_mrd_stream_cleanup(extended_gcsfs, zonal_mocks): """ Tests that mrd stream is properly closed with file closure. """ with zonal_mocks(json_data) as mocks: - if not gcs_adapter.on_google: + if not extended_gcsfs.on_google: def close_side_effect(): mocks["downloader"].is_stream_open = False mocks["downloader"].close.side_effect = close_side_effect - with gcs_adapter.open(file_path, "rb") as f: + with extended_gcsfs.open(file_path, "rb") as f: assert f.mrd is not None assert True is f.closed diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index ed82c22d..5e37c9c1 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -1,7 +1,7 @@ from fsspec import asyn -from . import zb_hns_utils -from .core import GCSFile +from gcsfs import zb_hns_utils +from gcsfs.core import GCSFile class ZonalFile(GCSFile): From a8945c23faa286c359e1e095e826a28a30af93a9 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Wed, 12 Nov 2025 20:33:17 +0530 Subject: [PATCH 55/59] Replaces __new__ with conditional import in init * registers ExtendedGCSFileSystem when experimental variable is set instead of __new__ hack * fixes registry race condition * removes unnecessary registry * adds separate ci tests * adds test_init.py * fixes pre-commit --- .github/workflows/ci.yml | 13 ++++++-- gcsfs/__init__.py | 17 ++++++++++ gcsfs/core.py | 16 ---------- gcsfs/tests/test_core.py | 26 ---------------- gcsfs/tests/test_init.py | 67 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 44 deletions(-) create mode 100644 gcsfs/tests/test_init.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 59943c45..da3df34b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,13 +35,22 @@ jobs: - name: install run: | pip install -e . 
-      - name: Run tests
+      - name: Run Standard Tests
         run: |
           export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json
           pytest -vv -s \
             --log-format="%(asctime)s %(levelname)s %(message)s" \
             --log-date-format="%H:%M:%S" \
-            gcsfs/
+            gcsfs/ \
+            --ignore=gcsfs/tests/test_extended_gcsfs.py
+      - name: Run Extended Tests
+        run: |
+          export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json
+          export GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT="true"
+          pytest -vv -s \
+            --log-format="%(asctime)s %(levelname)s %(message)s" \
+            --log-date-format="%H:%M:%S" \
+            gcsfs/tests/test_extended_gcsfs.py
 
   lint:
     name: lint
diff --git a/gcsfs/__init__.py b/gcsfs/__init__.py
index fffbca44..b47e3bb9 100644
--- a/gcsfs/__init__.py
+++ b/gcsfs/__init__.py
@@ -1,10 +1,27 @@
+import logging
+import os
+
 from ._version import get_versions
 
+logger = logging.getLogger(__name__)
 __version__ = get_versions()["version"]
 del get_versions
 
 from .core import GCSFileSystem
 from .mapping import GCSMap
 
+if os.getenv("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false").lower() in ("true", "1"):
+    try:
+        from .extended_gcsfs import ExtendedGcsFileSystem as GCSFileSystem
+
+        logger.info(
+            "gcsfs experimental features enabled via GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT."
+        )
+    except ImportError as e:
+        logger.warning(
+            f"GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT is set, but failed to import experimental features: {e}"
+        )
+        # Fallback to core GCSFileSystem, do not register here
+
 __all__ = ["GCSFileSystem", "GCSMap"]
 
 from . import _version
diff --git a/gcsfs/core.py b/gcsfs/core.py
index ee61f5ce..18b4eb3e 100644
--- a/gcsfs/core.py
+++ b/gcsfs/core.py
@@ -282,22 +282,6 @@ class GCSFileSystem(asyn.AsyncFileSystem):
     protocol = "gs", "gcs"
     async_impl = True
 
-    def __new__(cls, *args, **kwargs):
-        """
-        Factory to return an ExtendedGcsFileSystem instance if the experimental
-        flag is enabled.
- """ - experimental_multi_bucket_support = os.environ.get( - "GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false" - ).lower() in ("true", "1") - - if experimental_multi_bucket_support: - from .extended_gcsfs import ExtendedGcsFileSystem - - return object.__new__(ExtendedGcsFileSystem) - else: - return object.__new__(cls) - def __init__( self, project=DEFAULT_PROJECT, diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 66944247..a93dbd4c 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -4,7 +4,6 @@ from datetime import datetime, timezone from itertools import chain from unittest import mock -from unittest.mock import patch from urllib.parse import parse_qs, unquote, urlparse from uuid import uuid4 @@ -19,7 +18,6 @@ from gcsfs import __version__ as version from gcsfs.core import GCSFileSystem, quote from gcsfs.credentials import GoogleCredentials -from gcsfs.extended_gcsfs import ExtendedGcsFileSystem from gcsfs.tests.conftest import a, allfiles, b, csv_files, files, text_files from gcsfs.tests.utils import tempdir, tmpfile @@ -1733,27 +1731,3 @@ def test_near_find(gcs): def test_get_error(gcs): with pytest.raises(FileNotFoundError): gcs.get_file(f"{TEST_BUCKET}/doesnotexist", "other") - - -def test_gcs_filesystem_when_experimental_zonal_toggle_is_not_present(gcs_factory): - gcs = gcs_factory() - - assert isinstance( - gcs, gcsfs.GCSFileSystem - ), "Expected File system instance to be GCSFileSystem" - assert not isinstance( - gcs, ExtendedGcsFileSystem - ), "Expected File system instance to be GCSFileSystem" - - -def test_extended_gcs_filesystem_when_experimental_zonal_toggle_is_true(gcs_factory): - try: - with patch("google.auth.default", return_value=(None, "fake-project")): - os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" - gcs = gcs_factory() - - assert isinstance( - gcs, ExtendedGcsFileSystem - ), "Expected File system instance to be ExtendedGcsFileSystem" - finally: - del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] diff --git a/gcsfs/tests/test_init.py b/gcsfs/tests/test_init.py new file mode 100644 index 00000000..4619de4f --- /dev/null +++ b/gcsfs/tests/test_init.py @@ -0,0 +1,67 @@ +import os +import sys + + +class TestConditionalImport: + def setup_method(self, method): + """Setup for each test method.""" + self.original_env = os.environ.get("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT") + + # Snapshot original gcsfs modules + self.original_modules = { + name: mod for name, mod in sys.modules.items() if name.startswith("gcsfs") + } + + # Unload gcsfs modules to force re-import during the test + modules_to_remove = list(self.original_modules.keys()) + for name in modules_to_remove: + if name in sys.modules: + del sys.modules[name] + + def teardown_method(self, method): + """Teardown after each test method.""" + # Reset environment variable to its original state + if self.original_env is not None: + os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = self.original_env + elif "GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT" in os.environ: + del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] + + # Clear any gcsfs modules loaded/modified during this test + modules_to_remove = [name for name in sys.modules if name.startswith("gcsfs")] + for name in modules_to_remove: + if name in sys.modules: + del sys.modules[name] + + # Restore the original gcsfs modules from the snapshot to avoid side effect + # affecting other tests + sys.modules.update(self.original_modules) + + def test_experimental_env_unset(self): + """ + Tests gcsfs.GCSFileSystem is core.GCSFileSystem 
when + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT is NOT set. + """ + if "GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT" in os.environ: + del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] + + import gcsfs + + assert ( + gcsfs.GCSFileSystem is gcsfs.core.GCSFileSystem + ), "Should be core.GCSFileSystem" + assert not hasattr( + gcsfs, "ExtendedGcsFileSystem" + ), "ExtendedGcsFileSystem should not be imported directly on gcsfs" + + def test_experimental_env_set(self): + """ + Tests gcsfs.GCSFileSystem is extended_gcsfs.ExtendedGcsFileSystem when + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT IS set. + """ + os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" + + import gcsfs + + assert ( + gcsfs.GCSFileSystem is gcsfs.extended_gcsfs.ExtendedGcsFileSystem + ), "Should be ExtendedGcsFileSystem" From 90d0cf4e6435e3fba7afe10095415e7f414f5624 Mon Sep 17 00:00:00 2001 From: Ankita Luthra Date: Fri, 14 Nov 2025 11:28:54 +0000 Subject: [PATCH 56/59] fixes lint errors --- gcsfs/tests/test_extended_gcsfs.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/gcsfs/tests/test_extended_gcsfs.py b/gcsfs/tests/test_extended_gcsfs.py index a7737bb6..e707a444 100644 --- a/gcsfs/tests/test_extended_gcsfs.py +++ b/gcsfs/tests/test_extended_gcsfs.py @@ -58,15 +58,18 @@ async def download_side_effect(read_requests, **kwargs): ) mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) - with mock.patch( - patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL - ) as mock_sync_layout, mock.patch( - patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL - ), mock.patch( - patch_target_create_mrd, mock_create_mrd - ), mock.patch( - patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock - ) as mock_cat_file: + with ( + mock.patch( + patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL + ) as mock_sync_layout, + mock.patch( + patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL + ), + mock.patch(patch_target_create_mrd, mock_create_mrd), + mock.patch( + patch_target_gcsfs_cat_file, new_callable=mock.AsyncMock + ) as mock_cat_file, + ): mocks = { "sync_layout": mock_sync_layout, "create_mrd": mock_create_mrd, From 8672c2802e577b06492b07c841bf75b177349473 Mon Sep 17 00:00:00 2001 From: suni72 <59387263+suni72@users.noreply.github.com> Date: Sat, 15 Nov 2025 11:59:02 +0530 Subject: [PATCH 57/59] simplified logic in cleanup_gcs for unit tests removed redundant environment variable setting removed try block for creating TEST_BUCKET to expose errors in test setup --- gcsfs/tests/conftest.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/gcsfs/tests/conftest.py b/gcsfs/tests/conftest.py index cf3b659d..5d5e0179 100644 --- a/gcsfs/tests/conftest.py +++ b/gcsfs/tests/conftest.py @@ -129,12 +129,10 @@ def gcs(gcs_factory, populate=True): def _cleanup_gcs(gcs, is_real_gcs): """Only remove the bucket/contents if we are NOT using the real GCS, logging a warning on failure.""" - del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] if is_real_gcs: return try: - gcs.rm(gcs.find(TEST_BUCKET), recursive=True) - gcs.rm(TEST_BUCKET) + gcs.rm(TEST_BUCKET, recursive=True) except Exception as e: logging.warning(f"Failed to clean up GCS bucket {TEST_BUCKET}: {e}") @@ -155,7 +153,6 @@ def extended_gcsfs(gcs_factory, populate=True): ) with mock_authentication_manager: - os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" extended_gcsfs = gcs_factory() try: # Only create/delete/populate the bucket if we are NOT using the real GCS 
endpoint
@@ -164,10 +161,7 @@ def extended_gcsfs(gcs_factory, populate=True):
                 extended_gcsfs.rm(TEST_BUCKET, recursive=True)
             except FileNotFoundError:
                 pass
-            try:
-                extended_gcsfs.mkdir(TEST_BUCKET)
-            except Exception:
-                pass
+            extended_gcsfs.mkdir(TEST_BUCKET)
             if populate:
                 extended_gcsfs.pipe(
                     {TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}

From 5398cfe48eefb8f4b1aaacaf6889c1ed52db82ad Mon Sep 17 00:00:00 2001
From: Ankita Luthra
Date: Tue, 18 Nov 2025 12:08:27 +0000
Subject: [PATCH 58/59] refactored get_bucket_type

---
 gcsfs/extended_gcsfs.py            | 27 +++++++++++++++------------
 gcsfs/tests/test_extended_gcsfs.py | 30 +++++++++++++++++-------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/gcsfs/extended_gcsfs.py b/gcsfs/extended_gcsfs.py
index 54f9229b..7c0c58f4 100644
--- a/gcsfs/extended_gcsfs.py
+++ b/gcsfs/extended_gcsfs.py
@@ -71,14 +71,21 @@ async def _create_control_plane_client(self):
             credentials=self.credentials.credentials, client_info=client_info
         )
 
-    async def _get_bucket_type(self, bucket):
+    async def _lookup_bucket_type(self, bucket):
         if bucket in self._storage_layout_cache:
             return self._storage_layout_cache[bucket]
-        try:
+        bucket_type = await self._get_bucket_type(bucket)
+        # Don't cache UNKNOWN type
+        if bucket_type == BucketType.UNKNOWN:
+            return BucketType.UNKNOWN
+        self._storage_layout_cache[bucket] = bucket_type
+        return self._storage_layout_cache[bucket]
+
+    _sync_lookup_bucket_type = asyn.sync_wrapper(_lookup_bucket_type)
 
-            # Bucket name details
+    async def _get_bucket_type(self, bucket):
+        try:
             bucket_name_value = f"projects/_/buckets/{bucket}/storageLayout"
-            # Make the request to get bucket type
             response = await self._storage_control_client.get_storage_layout(
                 name=bucket_name_value
             )
@@ -89,17 +96,15 @@ async def _get_bucket_type(self, bucket):
             # This should be updated to include HNS in the future
             return BucketType.NON_HIERARCHICAL
         except api_exceptions.NotFound:
-            print(f"Error: Bucket {bucket} not found or you lack permissions.")
+            logger.warning(f"Error: Bucket {bucket} not found or you lack permissions.")
             return BucketType.UNKNOWN
         except Exception as e:
             logger.error(
                 f"Could not determine bucket type for bucket name {bucket}: {e}"
             )
-            # Default to UNKNOWN
+            # Default to UNKNOWN in case bucket type is not obtained
             return BucketType.UNKNOWN
 
-    _sync_get_bucket_type = asyn.sync_wrapper(_get_bucket_type)
-
     def _open(
         self,
         path,
@@ -118,8 +123,7 @@ def _open(
         Open a file.
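
        The file class is picked from the bucket's layout through the
        gcs_file_types mapping, so roughly (assuming the mapping pairs
        BucketType.ZONAL_HIERARCHICAL with ZonalFile and the remaining
        types with GCSFile):

            fs.open("zonal-bucket/blob", "rb")     # -> ZonalFile
            fs.open("regional-bucket/blob", "rb")  # -> GCSFile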
""" bucket, _, _ = self.split_path(path) - bucket_type = self._sync_get_bucket_type(bucket) - self._storage_layout_cache[bucket] = bucket_type + bucket_type = self._sync_lookup_bucket_type(bucket) return gcs_file_types[bucket_type]( self, path, @@ -191,8 +195,7 @@ async def _process_limits_to_offset_and_length(self, path, start, end): ) async def _is_zonal_bucket(self, bucket): - bucket_type = await self._get_bucket_type(bucket) - self._storage_layout_cache[bucket] = bucket_type + bucket_type = await self._lookup_bucket_type(bucket) return bucket_type == BucketType.ZONAL_HIERARCHICAL async def _cat_file(self, path, start=None, end=None, **kwargs): diff --git a/gcsfs/tests/test_extended_gcsfs.py b/gcsfs/tests/test_extended_gcsfs.py index e707a444..2e160826 100644 --- a/gcsfs/tests/test_extended_gcsfs.py +++ b/gcsfs/tests/test_extended_gcsfs.py @@ -34,11 +34,11 @@ def _zonal_mocks_factory(file_data): if is_real_gcs: yield None return - patch_target_get_layout = ( - "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._get_bucket_type" + patch_target_lookup_bucket_type = ( + "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._lookup_bucket_type" ) - patch_target_sync_layout = ( - "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._sync_get_bucket_type" + patch_target_sync_lookup_bucket_type = ( + "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._sync_lookup_bucket_type" ) patch_target_create_mrd = "gcsfs.extended_gcsfs.zb_hns_utils.create_mrd" patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" @@ -60,10 +60,12 @@ async def download_side_effect(read_requests, **kwargs): mock_create_mrd = mock.AsyncMock(return_value=mock_downloader) with ( mock.patch( - patch_target_sync_layout, return_value=BucketType.ZONAL_HIERARCHICAL - ) as mock_sync_layout, + patch_target_sync_lookup_bucket_type, + return_value=BucketType.ZONAL_HIERARCHICAL, + ) as mock_sync_lookup_bucket_type, mock.patch( - patch_target_get_layout, return_value=BucketType.ZONAL_HIERARCHICAL + patch_target_lookup_bucket_type, + return_value=BucketType.ZONAL_HIERARCHICAL, ), mock.patch(patch_target_create_mrd, mock_create_mrd), mock.patch( @@ -71,7 +73,7 @@ async def download_side_effect(read_requests, **kwargs): ) as mock_cat_file, ): mocks = { - "sync_layout": mock_sync_layout, + "sync_lookup_bucket_type": mock_sync_lookup_bucket_type, "create_mrd": mock_create_mrd, "downloader": mock_downloader, "cat_file": mock_cat_file, @@ -114,7 +116,9 @@ def test_read_block_zb(extended_gcsfs, zonal_mocks, subtests): assert result == expected_data if mocks: - mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + mocks["sync_lookup_bucket_type"].assert_called_once_with( + TEST_BUCKET + ) if expected_data: mocks["downloader"].download_ranges.assert_called_with( [(offset, mock.ANY, mock.ANY)] @@ -142,7 +146,7 @@ def test_read_small_zb(extended_gcsfs, zonal_mocks): # cache drop assert len(f.cache.cache) < len(out) if mocks: - mocks["sync_layout"].assert_called_once_with(TEST_BUCKET) + mocks["sync_lookup_bucket_type"].assert_called_once_with(TEST_BUCKET) def test_readline_zb(extended_gcsfs, zonal_mocks): @@ -161,7 +165,7 @@ def test_readline_from_cache_zb(extended_gcsfs, zonal_mocks): data = b"a,b\n11,22\n3,4" if not extended_gcsfs.on_google: with mock.patch.object( - extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + extended_gcsfs, "_sync_lookup_bucket_type", return_value=BucketType.UNKNOWN ): with extended_gcsfs.open(a, "wb") as f: f.write(data) @@ -187,7 +191,7 @@ def test_readline_empty_zb(extended_gcsfs, zonal_mocks): data = b"" if 
not extended_gcsfs.on_google: with mock.patch.object( - extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + extended_gcsfs, "_sync_lookup_bucket_type", return_value=BucketType.UNKNOWN ): with extended_gcsfs.open(b, "wb") as f: f.write(data) @@ -201,7 +205,7 @@ def test_readline_blocksize_zb(extended_gcsfs, zonal_mocks): data = b"ab\n" + b"a" * (2**18) + b"\nab" if not extended_gcsfs.on_google: with mock.patch.object( - extended_gcsfs, "_sync_get_bucket_type", return_value=BucketType.UNKNOWN + extended_gcsfs, "_sync_lookup_bucket_type", return_value=BucketType.UNKNOWN ): with extended_gcsfs.open(c, "wb") as f: f.write(data) From 17964552f1fa4d4f4221dfca40cfb4c51436a4d1 Mon Sep 17 00:00:00 2001 From: suni72 <59387263+suni72@users.noreply.github.com> Date: Tue, 18 Nov 2025 20:34:32 +0530 Subject: [PATCH 59/59] Refactor: Zonal Read Logic and Remove Redundant Utilities * remove redundant create mrd method from zb_hns_utils * Added mrd to _cat_file method signature and added description * Optimized _process_limits_to_offset_and_length method to only call info when size is None. * Add todo to update GCSMap to use correct implementation --- gcsfs/__init__.py | 2 ++ gcsfs/extended_gcsfs.py | 36 +++++++++++++++++++++--------- gcsfs/tests/test_extended_gcsfs.py | 5 ++++- gcsfs/tests/test_zb_hns_utils.py | 27 ---------------------- gcsfs/zb_hns_utils.py | 14 ------------ gcsfs/zonal_file.py | 6 +++-- 6 files changed, 35 insertions(+), 55 deletions(-) diff --git a/gcsfs/__init__.py b/gcsfs/__init__.py index b47e3bb9..f22d6ba1 100644 --- a/gcsfs/__init__.py +++ b/gcsfs/__init__.py @@ -22,6 +22,8 @@ ) # Fallback to core GCSFileSystem, do not register here +# TODO: GCSMap still refers to the original GCSFileSystem. This will be +# addressed in a future update. __all__ = ["GCSFileSystem", "GCSMap"] from . 
import _version diff --git a/gcsfs/extended_gcsfs.py b/gcsfs/extended_gcsfs.py index 7c0c58f4..f4c643a4 100644 --- a/gcsfs/extended_gcsfs.py +++ b/gcsfs/extended_gcsfs.py @@ -7,6 +7,9 @@ from google.api_core.client_info import ClientInfo from google.cloud import storage_control_v2 from google.cloud.storage._experimental.asyncio.async_grpc_client import AsyncGrpcClient +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( + AsyncMultiRangeDownloader, +) from gcsfs import __version__ as version from gcsfs import zb_hns_utils @@ -160,21 +163,20 @@ async def _process_limits_to_offset_and_length(self, path, start, end): if start is None: offset = 0 elif start < 0: - size = size or (await self._info(path))["size"] + size = (await self._info(path))["size"] if size is None else size offset = size + start else: offset = start if end is None: - size = size or (await self._info(path))["size"] + size = (await self._info(path))["size"] if size is None else size effective_end = size elif end < 0: - size = size or (await self._info(path))["size"] + size = (await self._info(path))["size"] if size is None else size effective_end = size + end else: effective_end = end - size = size or (await self._info(path))["size"] if offset < 0: raise ValueError(f"Calculated start offset ({offset}) cannot be negative.") if effective_end < offset: @@ -183,10 +185,11 @@ async def _process_limits_to_offset_and_length(self, path, start, end): ) elif effective_end == offset: length = 0 # Handle zero-length slice - elif effective_end > size: - length = max(0, size - offset) # Clamp and ensure non-negative else: length = effective_end - offset # Normal case + size = (await self._info(path))["size"] if size is None else size + if effective_end > size: + length = max(0, size - offset) # Clamp and ensure non-negative return offset, length @@ -198,9 +201,20 @@ async def _is_zonal_bucket(self, bucket): bucket_type = await self._lookup_bucket_type(bucket) return bucket_type == BucketType.ZONAL_HIERARCHICAL - async def _cat_file(self, path, start=None, end=None, **kwargs): - """ - Fetch a file's contents as bytes. + async def _cat_file(self, path, start=None, end=None, mrd=None, **kwargs): + """Fetch a file's contents as bytes, with an optimized path for Zonal buckets. + + This method overrides the parent `_cat_file` to read objects in Zonal buckets using gRPC. + + Args: + path (str): The full GCS path to the file (e.g., "bucket/object"). + start (int, optional): The starting byte position to read from. + end (int, optional): The ending byte position to read to. + mrd (AsyncMultiRangeDownloader, optional): An existing multi-range + downloader instance. If not provided, a new one will be created for Zonal buckets. + + Returns: + bytes: The content of the file or file range. 
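+
+        Example (a sketch; bucket and object names are hypothetical):
+
+            # Zonal bucket: the range is served through an AsyncMultiRangeDownloader.
+            data = await fs._cat_file("zonal-bucket/blob", start=0, end=1024)
+
+            # Any other bucket: falls through to GCSFileSystem._cat_file.
+            data = await fs._cat_file("regional-bucket/blob")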
""" mrd = kwargs.pop("mrd", None) mrd_created = False @@ -213,7 +227,7 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): if not await self._is_zonal_bucket(bucket): return await super()._cat_file(path, start=start, end=end, **kwargs) - mrd = await zb_hns_utils.create_mrd( + mrd = await AsyncMultiRangeDownloader.create_mrd( self.grpc_client, bucket, object_name, generation ) mrd_created = True @@ -226,6 +240,6 @@ async def _cat_file(self, path, start=None, end=None, **kwargs): offset=offset, length=length, mrd=mrd ) finally: - # Explicit cleanup if we created the MRD and it has a close method + # Explicit cleanup if we created the MRD if mrd_created: await mrd.close() diff --git a/gcsfs/tests/test_extended_gcsfs.py b/gcsfs/tests/test_extended_gcsfs.py index 2e160826..1ff2b653 100644 --- a/gcsfs/tests/test_extended_gcsfs.py +++ b/gcsfs/tests/test_extended_gcsfs.py @@ -40,7 +40,10 @@ def _zonal_mocks_factory(file_data): patch_target_sync_lookup_bucket_type = ( "gcsfs.extended_gcsfs.ExtendedGcsFileSystem._sync_lookup_bucket_type" ) - patch_target_create_mrd = "gcsfs.extended_gcsfs.zb_hns_utils.create_mrd" + patch_target_create_mrd = ( + "google.cloud.storage._experimental.asyncio.async_multi_range_downloader" + ".AsyncMultiRangeDownloader.create_mrd" + ) patch_target_gcsfs_cat_file = "gcsfs.core.GCSFileSystem._cat_file" async def download_side_effect(read_requests, **kwargs): diff --git a/gcsfs/tests/test_zb_hns_utils.py b/gcsfs/tests/test_zb_hns_utils.py index 78976230..a64e6793 100644 --- a/gcsfs/tests/test_zb_hns_utils.py +++ b/gcsfs/tests/test_zb_hns_utils.py @@ -5,33 +5,6 @@ from gcsfs import zb_hns_utils -@pytest.mark.asyncio -async def test_create_mrd(): - """ - Tests that create_mrd calls the underlying AsyncMultiRangeDownloader.create_mrd - with the correct arguments and returns its result. - """ - mock_grpc_client = mock.Mock() - bucket_name = "test-bucket" - object_name = "test-object" - generation = "12345" - mock_mrd_instance = mock.AsyncMock() - - with mock.patch( - "gcsfs.zb_hns_utils.AsyncMultiRangeDownloader.create_mrd", - new_callable=mock.AsyncMock, - return_value=mock_mrd_instance, - ) as mock_create: - result = await zb_hns_utils.create_mrd( - mock_grpc_client, bucket_name, object_name, generation - ) - - mock_create.assert_called_once_with( - mock_grpc_client, bucket_name, object_name, generation - ) - assert result is mock_mrd_instance - - @pytest.mark.asyncio async def test_download_range(): """ diff --git a/gcsfs/zb_hns_utils.py b/gcsfs/zb_hns_utils.py index 3ddf2c37..648974e2 100644 --- a/gcsfs/zb_hns_utils.py +++ b/gcsfs/zb_hns_utils.py @@ -1,19 +1,5 @@ from io import BytesIO -from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( - AsyncMultiRangeDownloader, -) - - -async def create_mrd(grpc_client, bucket_name, object_name, generation=None): - """ - Creates the AsyncMultiRangeDownloader. 
- """ - mrd = await AsyncMultiRangeDownloader.create_mrd( - grpc_client, bucket_name, object_name, generation - ) - return mrd - async def download_range(offset, length, mrd): """ diff --git a/gcsfs/zonal_file.py b/gcsfs/zonal_file.py index 5e37c9c1..93afb84c 100644 --- a/gcsfs/zonal_file.py +++ b/gcsfs/zonal_file.py @@ -1,6 +1,8 @@ from fsspec import asyn +from google.cloud.storage._experimental.asyncio.async_multi_range_downloader import ( + AsyncMultiRangeDownloader, +) -from gcsfs import zb_hns_utils from gcsfs.core import GCSFile @@ -29,7 +31,7 @@ async def _init_mrd(self, bucket_name, object_name, generation=None): """ Initializes the AsyncMultiRangeDownloader. """ - return await zb_hns_utils.create_mrd( + return await AsyncMultiRangeDownloader.create_mrd( self.gcsfs.grpc_client, bucket_name, object_name, generation )