Skip to content
17 changes: 12 additions & 5 deletions gcsfs/extended_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from gcsfs import __version__ as version
from gcsfs import zb_hns_utils
from gcsfs.core import GCSFile, GCSFileSystem
from gcsfs.retry import execute_with_timebound_retry
from gcsfs.zonal_file import ZonalFile

logger = logging.getLogger("gcsfs")
Expand Down Expand Up @@ -130,7 +131,9 @@ async def _get_bucket_type(self, bucket):
client = await self._get_control_plane_client()
bucket_name_value = f"projects/_/buckets/{bucket}/storageLayout"
logger.debug(f"get_storage_layout request for name: {bucket_name_value}")
response = await client.get_storage_layout(name=bucket_name_value)
response = await execute_with_timebound_retry(
client.get_storage_layout, name=bucket_name_value
)

if response.location_type == "zone":
return BucketType.ZONAL_HIERARCHICAL
Expand Down Expand Up @@ -509,7 +512,9 @@ async def _mv(self, path1, path2, **kwargs):

logger.debug(f"rename_folder request: {request}")
client = await self._get_control_plane_client()
operation = await client.rename_folder(request=request)
operation = await execute_with_timebound_retry(
client.rename_folder, request=request
)
await operation.result()
self._update_dircache_after_rename(path1, path2)

Expand Down Expand Up @@ -622,7 +627,7 @@ async def _mkdir(
try:
logger.debug(f"create_folder request: {request}")
client = await self._get_control_plane_client()
await client.create_folder(request=request)
await execute_with_timebound_retry(client.create_folder, request=request)
# Instead of invalidating the parent cache, update it to add the new entry.
parent_path = self._parent(path)
if parent_path in self.dircache:
Expand Down Expand Up @@ -668,7 +673,9 @@ async def _get_directory_info(self, path, bucket, key, generation):

# Verify existence using get_folder API
client = await self._get_control_plane_client()
response = await client.get_folder(request=request)
response = await execute_with_timebound_retry(
client.get_folder, request=request
)

# If successful, return directory metadata
return {
Expand Down Expand Up @@ -741,7 +748,7 @@ async def _rmdir(self, path):

logger.debug(f"delete_folder request: {request}")
client = await self._get_control_plane_client()
await client.delete_folder(request=request)
await execute_with_timebound_retry(client.delete_folder, request=request)

# Remove the directory from the cache and from its parent's listing.
self.dircache.pop(path, None)
Expand Down
59 changes: 59 additions & 0 deletions gcsfs/retry.py
Copy link
Copy Markdown
Contributor

@jasha26 jasha26 Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of the custom asyncio.wait_for logic, using a library like tenacity would look like the snippet below. Also, did we evaluate google.api_core's AsyncRetry?

from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type

@retry(
    wait=wait_exponential(multiplier=1, min=2, max=32),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type((api_exceptions.ServiceUnavailable, asyncio.TimeoutError)),
    reraise=True
)
async def call_with_retry(func, *args, **kwargs):
    return await func(*args, **kwargs)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need asyncio.wait_for logic on the client side to make sure that we handle the request stalls and won't wait indefinitely for the call to return. Replaced the custom logic with tenacity as it provides in-built support for retries. AsyncRetry also provides the same functionality but would still need the asyncio.wait_for logic on the client side.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need another dependency?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a similar opinion, but @jasha26 recommended using Tenacity as it might help in future integrations like client-side throttling

AsyncRetry from google.api_core supports retries based on max_timeout instead of max_retries (which is what gcsfs follows for other JSON API retries). To keep the same retry behaviour for JSON APIs and storage control client calls (i.e., limiting the number of retries), I have implemented the custom logic. To use AsyncRetry with a constraint on the number of attempts, we would have to maintain a wrapper to track the number of attempts, which would be almost the same as the initial version of this PR without adding much benefit from using AsyncRetry.

So we can either have entirely custom implementation or use Tenacity if we want to maintain the max_attempts behaviour across GCSFS

Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import google.auth.exceptions
import requests.exceptions
from decorator import decorator
from google.api_core import exceptions as api_exceptions

logger = logging.getLogger("gcsfs")

Expand Down Expand Up @@ -176,3 +177,61 @@ async def retry_request(func, retries=6, *args, **kwargs):
continue
logger.exception(f"{func.__name__} non-retriable exception: {e}")
raise e


async def execute_with_timebound_retry(
func, *args, retry_deadline=30.0, max_retries=6, **kwargs
Copy link
Copy Markdown
Contributor

@jasha26 jasha26 Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't really hard-code these values; we need the ability for them to be overridden via multiple mechanisms, such as call-site overrides, fsspec config overrides, etc.

So i'd recommend we do something like below:

from fsspec.config import conf

@dataclass
class RetryConfig:
    max_retries: int = 6
    min_delay: float = 2.0
    max_delay: float = 32.0
    retry_deadline: float = 30.0

def get_resolved_retry_config(call_kwargs) -> RetryConfig:
    """
    Resolves retry configuration with a clear hierarchy of overrides:
    1. Explicit call-site arguments (e.g., max_retries=10)
    2. fsspec.config settings (e.g., ~/.config/fsspec/conf.json)
    3. Hardcoded Defaults from the RetryConfig template
    """
    # 1. Start with the default template
    default = RetryConfig()

    # 2. Resolve parameters from Env Vars or fsspec.config, or use defaults
    resolved_max_retries = int(
        call_kwargs.get("max_retries")
        or conf.get("gcsfs.retry.max_retries", default.max_retries)
    )
    
    resolved_deadline = float(
        call_kwargs.get("retry_deadline")
        or conf.get("gcsfs.retry.deadline", default.retry_deadline)
    )

    return RetryConfig(
        max_retries=resolved_max_retries,
        retry_deadline=resolved_deadline,
        min_delay=default.min_delay,
        max_delay=default.max_delay
    )

async def with_retry(func, *args, **kwargs):
    config = get_resolved_retry_config(kwargs)
    
    # Define transient errors consistent with GCS client best practices.
    RETRYABLE_ERRORS = (
        api_exceptions.ServiceUnavailable,
        api_exceptions.DeadlineExceeded,
        api_exceptions.InternalServerError,
        api_exceptions.TooManyRequests,
        asyncio.TimeoutError,
    )

    # Replaces custom loop with a declarative tenacity decorator.
    @retry(
        stop=stop_after_attempt(config.max_retries),
        wait=wait_exponential(multiplier=1, min=config.min_delay, max=config.max_delay),
        retry=retry_if_exception_type(RETRYABLE_ERRORS),
        reraise=True
    )
    async def _wrapped_call():
        return await func(*args, **kwargs)
    
    return await _wrapped_call()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not a fan of adding all these environment variables. fsspec already has a way to specify instantiation kwargs using specially formatted environment variables or files ( https://filesystem-spec.readthedocs.io/en/latest/features.html#configuration ).

Since retries are useful in multiple backends and have the same concepts (number of times, backoff factor, max wait, etc.), we could even make a general fsspec class for this.

):
    """
    Execute a gRPC storage control API call with a strict per-attempt timeout
    and an overall maximum number of retries.

    Transient errors and timeouts trigger an exponential-backoff retry loop.

    :param func: Awaitable API method to invoke (e.g. ``client.get_folder``).
    :param retry_deadline: Per-attempt timeout in seconds. Forwarded to the
        call as ``timeout=`` and mirrored locally by ``asyncio.wait_for``.
    :param max_retries: Maximum total number of attempts before the last
        exception is re-raised; ``None`` means retry indefinitely.
    :raises: The last exception encountered, once it is classified as
        non-transient or ``max_retries`` attempts have been made.
    """
    attempt = 0
    while True:
        try:
            # We enforce a per-call timeout by passing `timeout=retry_deadline` to the API call.
            # asyncio.wait_for serves as a hard local fallback to cancel the task if the gRPC timeout fails to abort.
            # `retry=None` disables the client library's built-in retry so this
            # loop is the single owner of the retry policy (attempts are counted here).
            return await asyncio.wait_for(
                func(*args, timeout=retry_deadline, retry=None, **kwargs),
                timeout=retry_deadline + 1.0,
            )
        except Exception as e:
            # Determine if the exception is transient and should be retried.
            is_transient = isinstance(
                e,
                (
                    api_exceptions.RetryError,
                    api_exceptions.DeadlineExceeded,
                    api_exceptions.ServiceUnavailable,
                    api_exceptions.InternalServerError,
                    api_exceptions.TooManyRequests,
                    api_exceptions.ResourceExhausted,
                    api_exceptions.Unknown,
                    asyncio.TimeoutError,
                ),
            )

            # Workaround: retry on 401s / Unauthenticated during transient token lapses.
            # Only the "Invalid Credentials" flavour is treated as transient; other
            # 401s (e.g. genuinely bad credentials) are re-raised immediately.
            if (
                not is_transient
                and isinstance(e, api_exceptions.Unauthenticated)
                and "Invalid Credentials" in str(e)
            ):
                is_transient = True

            if not is_transient:
                raise e

            attempt += 1

            # Give up once the attempt budget is exhausted and surface the
            # last exception to the caller.
            if max_retries is not None and attempt >= max_retries:
                logger.exception(
                    f"{func.__name__} out of max retries ({max_retries}) on exception: {e}"
                )
                raise e

            # Jittered exponential backoff: random [0, 1) + 2^(attempt-1),
            # capped at 32 seconds.
            sleep_time = min(random.random() + 2 ** (attempt - 1), 32)
            logger.debug(
                f"{func.__name__} retrying (attempt {attempt}) after {sleep_time:.2f}s due to exception: {e}"
            )
            await asyncio.sleep(sleep_time)
34 changes: 34 additions & 0 deletions gcsfs/tests/integration/test_extended_hns.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,40 @@ def test_hns_empty_folder_rename_success(self, gcs_hns):
assert not gcsfs.exists(path1)
assert gcsfs.exists(path2)

def test_hns_folder_rename_idempotency_retry_integration(self, gcs_hns):
    """Verify retry-on-timeout behaviour of folder rename against real GCS.

    The first ``asyncio.wait_for`` call is forced to time out client-side
    while the underlying request is still allowed to reach GCS in the
    background, so the retry path must tolerate the server having already
    applied the rename (idempotency).
    """
    gcsfs = gcs_hns
    # Unique source/destination folder names so concurrent runs don't collide.
    path1 = f"{TEST_HNS_BUCKET}/integration_retry_old_dir_{uuid.uuid4().hex}"
    path2 = f"{TEST_HNS_BUCKET}/integration_retry_new_dir_{uuid.uuid4().hex}"

    gcsfs.pipe(f"{path1}/file.txt", b"data")

    import asyncio

    original_wait_for = asyncio.wait_for
    call_count = 0

    async def mocked_wait_for(coro, timeout):
        # Simulate a request stall: the first attempt times out locally while
        # the real network request keeps running in the background.
        nonlocal call_count
        call_count += 1
        if call_count == 1:
            # Schedule the actual GCS network request in the background
            asyncio.create_task(coro)
            # Intentionally timeout client-side before GCS can return the response
            await asyncio.sleep(0.01)
            raise asyncio.TimeoutError()

        # Subsequent attempts behave normally.
        return await original_wait_for(coro, timeout)

    from unittest import mock

    with mock.patch("gcsfs.retry.asyncio.wait_for", new=mocked_wait_for):
        gcsfs.mv(path1, path2)

    # At least one retry must have happened, and the rename must have taken
    # effect exactly once: source gone, destination present.
    assert call_count >= 2
    assert not gcsfs.exists(path1)
    assert gcsfs.exists(path2)

def test_file_rename_using_atomic_mv(
self,
gcs_hns,
Expand Down
Loading
Loading