diff --git a/cloudbuild/run_tests.sh b/cloudbuild/run_tests.sh index a970015d4..9b13fe7c7 100644 --- a/cloudbuild/run_tests.sh +++ b/cloudbuild/run_tests.sh @@ -56,13 +56,15 @@ case "$TEST_SUITE" in # - test_core_versioned.py: HNS buckets do not support versioning. # - test_core.py::test_sign: Current Cloud Build auth setup does not support this. # - test_core.py::test_mv_file_cache: Integration test only applicable for regional buckets. + # - test_core.py::test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion. pytest "${ARGS[@]}" gcsfs/ \ --deselect gcsfs/tests/test_extended_gcsfs.py \ --deselect gcsfs/tests/test_zonal_file.py \ --deselect gcsfs/tests/test_extended_gcsfs_unit.py \ --deselect gcsfs/tests/test_core_versioned.py \ --deselect gcsfs/tests/test_core.py::test_sign \ - --deselect gcsfs/tests/test_core.py::test_mv_file_cache + --deselect gcsfs/tests/test_core.py::test_mv_file_cache \ + --deselect gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive ;; "zonal-core") @@ -107,6 +109,7 @@ case "$TEST_SUITE" in # - test_array fails due to CRC32C TypeError with array objects. # - test_sign fails because it requires a private key # - test_mv_file_cache: Integration test only applicable for regional buckets. + # - test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion. ZONAL_DESELECTS+=( "--deselect=gcsfs/tests/test_core.py::test_flush" "--deselect=gcsfs/tests/test_core.py::test_write_blocks" @@ -115,6 +118,7 @@ case "$TEST_SUITE" in "--deselect=gcsfs/tests/test_core.py::test_array" "--deselect=gcsfs/tests/test_core.py::test_sign" "--deselect=gcsfs/tests/test_core.py::test_mv_file_cache" + "--deselect=gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive" ) pytest "${ARGS[@]}" "${ZONAL_DESELECTS[@]}" gcsfs/tests/test_core.py diff --git a/docs/source/hns_buckets.rst b/docs/source/hns_buckets.rst index 65de1ad48..fe55be918 100644 --- a/docs/source/hns_buckets.rst +++ b/docs/source/hns_buckets.rst @@ -56,7 +56,8 @@ While ``gcsfs`` aims to abstract the differences via the ``fsspec`` API, you sho 2. **``mkdir`` behavior:** Previously, in a flat namespace, calling ``mkdir`` on a path could only ensure the underlying bucket exists. With HNS enabled, calling ``mkdir`` will create an actual folder resource in GCS. Furthermore, if you want to create nested folders (eg: bucket/a/b/c/d), pass ``create_parents=True``, it will physically create all intermediate folder resources along the specified path. 3. **No mixing or toggling:** You cannot toggle HNS on an existing flat-namespace bucket. You must create a new HNS bucket and migrate your data. 4. **Object naming:** Object names in HNS cannot end with a slash (``/``) unless without the creation of physical folder resources. -5. **Rename Operation Benchmarks** +5. **Non-Recursive Wildcard Deletions:** When using wildcard deletions without recursion (e.g., ``gcs.rm("bucket/dir/*", recursive=False)``), if the pattern matches a non-empty directory, HNS buckets will raise an error (typically surfaced as an ``OSError`` due to a failed precondition). In contrast, standard flat-namespace buckets silently ignore non-empty directories under the same circumstances. +6. **Rename Operation Benchmarks** The following benchmarks show the time taken (in seconds) to rename a directory containing a large number of files (spread across 256 folders and 8 levels) in a standard Regional bucket versus an HNS bucket (can be replicated using `gcsfs/tests/perf/microbenchmarks/rename`): diff --git a/gcsfs/extended_gcsfs.py b/gcsfs/extended_gcsfs.py index 3cf3495d2..a8f91a3bc 100644 --- a/gcsfs/extended_gcsfs.py +++ b/gcsfs/extended_gcsfs.py @@ -811,7 +811,7 @@ async def _expand_path_with_details( raise ValueError("maxdepth must be at least 1") if isinstance(path, str): - out = await self._expand_path_with_details( + return await self._expand_path_with_details( [path], recursive, maxdepth, detail=detail ) else: @@ -1000,6 +1000,8 @@ async def _find( For buckets with flat structure, it falls back to the parent implementation. """ path = self._strip_protocol(path) + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") bucket, _, _ = self.split_path(path) is_hns = await self._is_bucket_hns_enabled(bucket) @@ -1017,6 +1019,8 @@ async def _find( # Hybrid approach for HNS enabled buckets # 1. Fetch all files from super find() method by passing withdirs as False. + # We pass maxdepth as None here to ensure we fetch all files for caching, + # and then filter by maxdepth at the end of this method. files_task = asyncio.create_task( super()._find( path, @@ -1024,7 +1028,7 @@ async def _find( detail=True, # Get full details for merging and populating cache prefix=prefix, versions=versions, - maxdepth=maxdepth, + maxdepth=None, update_cache=False, # Defer caching until merging files and folders **kwargs, ) diff --git a/gcsfs/tests/integration/test_extended_hns.py b/gcsfs/tests/integration/test_extended_hns.py index f1ab386a4..8c42a5b70 100644 --- a/gcsfs/tests/integration/test_extended_hns.py +++ b/gcsfs/tests/integration/test_extended_hns.py @@ -918,6 +918,66 @@ def test_rm_empty_folder_cache_invalidation(self, gcs_hns): sibling_dir in gcsfs.dircache ), "Sibling directory cache should not be invalidated." + def test_rm_wildcard_with_empty_folders(self, gcs_hns): + """Test deleting files and empty folders using wildcards on HNS bucket.""" + gcsfs = gcs_hns + base_dir = f"{TEST_HNS_BUCKET}/rm_wildcard_empty_{uuid.uuid4().hex}" + + # Setup: Some files, some empty folders, some non-empty folders + empty_dir1 = f"{base_dir}/empty_1" + empty_dir2 = f"{base_dir}/empty_2" + empty_dir3 = f"{base_dir}/one_more_empty_dir" + other_dir = f"{base_dir}/other_dir" + file1 = f"{base_dir}/file1.txt" + + gcsfs.mkdir(empty_dir1, create_parents=True) + gcsfs.mkdir(empty_dir2) + gcsfs.touch(file1) + gcsfs.mkdir(empty_dir3) + gcsfs.touch(f"{other_dir}/file2.txt") + + # 1. Remove only matching empty folders using wildcard + gcsfs.rm(f"{base_dir}/empty_*", recursive=True) + + assert not gcsfs.exists(empty_dir1) + assert not gcsfs.exists(empty_dir2) + assert gcsfs.exists(file1) + assert gcsfs.exists(other_dir) + + # 2. Remove remaining using a broad wildcard + gcsfs.rm(f"{base_dir}/*", recursive=True) + + assert not gcsfs.exists(file1) + assert not gcsfs.exists(other_dir) + assert not gcsfs.exists(empty_dir3) + + # Verify base_dir is now empty (HNS might still have the base_dir itself) + assert gcsfs.ls(base_dir) == [] + + gcsfs.rm(base_dir, recursive=True) + + assert not gcsfs.exists(base_dir) + + def test_rm_wildcards_non_recursive(self, gcs_hns): + """Test 'base_dir/*' (non-recursive) raises an error on non-empty directories for HNS buckets.""" + gcsfs = gcs_hns + base_dir = f"{TEST_HNS_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}" + files = [ + f"{base_dir}/other.txt", + f"{base_dir}/subdir/nested.txt", + ] + for f in files: + gcsfs.touch(f) + + # 3. Test 'base_dir/*' (non-recursive) + # For HNS, this matches `other.txt` and `subdir`. Because `subdir` is not empty, + # delete_folder raises FailedPrecondition, which currently wraps to OSError. + with pytest.raises(OSError, match="Pre condition failed"): + gcsfs.rm(f"{base_dir}/*") + + assert not gcsfs.exists(files[0]) + assert gcsfs.exists(files[1]) # subdir file still exists + @pytest.fixture() def test_structure(gcs_hns): @@ -1052,6 +1112,28 @@ def test_find_updates_dircache_without_prefix( } assert not empty_dir_listing + def test_find_maxdepth_updates_cache(self, gcs_hns, test_structure): + """Test that find with maxdepth updates cache for deeper objects.""" + base_dir = test_structure["base_dir"] + gcs_hns.invalidate_cache() + assert not gcs_hns.dircache + + # Run find with maxdepth=1 + # This should return root_file, empty_dir, dir_with_files + # But it should also cache objects within dir_with_files, etc. + result = gcs_hns.find(base_dir, maxdepth=1) + assert test_structure["root_file"] in result + assert test_structure["file1"] not in result + + # Verify that the cache is populated for deeper objects even if not returned + # from find due to maxdepth filter + assert test_structure["dir_with_files"] in gcs_hns.dircache + dir_with_files_cache = { + d["name"] for d in gcs_hns.dircache[test_structure["dir_with_files"]] + } + assert test_structure["file1"] in dir_with_files_cache + assert test_structure["nested_dir"] in dir_with_files_cache + def test_find_does_not_update_dircache_with_prefix(self, gcs_hns, test_structure): """Test that find() does NOT populate the dircache when a prefix is given.""" base_dir = test_structure["base_dir"] diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 285ee03f8..26f8c0542 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -354,6 +354,87 @@ def test_rm_chunked_batch(gcs): assert fn not in files_removed +def test_rm_wildcards_in_directory(gcs): + base_dir = f"{TEST_BUCKET}/test_rm_complex_{uuid.uuid4().hex}" + files = [ + f"{base_dir}/file1.txt", + f"{base_dir}/file2.txt", + f"{base_dir}/other.txt", + f"{base_dir}/a1.dat", + f"{base_dir}/b1.dat", + f"{base_dir}/subdir/nested.txt", + ] + for f in files: + gcs.touch(f) + + # 1. Test '?' wildcard (non-recursive) + gcs.rm(f"{base_dir}/file?.txt") + assert not gcs.exists(files[0]) + assert not gcs.exists(files[1]) + assert gcs.exists(files[2]) + assert gcs.exists(files[5]) + + # 2. Test list of patterns with '*' wildcards (non-recursive) + gcs.rm([f"{base_dir}/a*", f"{base_dir}/b*"]) + assert not gcs.exists(files[3]) + assert not gcs.exists(files[4]) + assert gcs.exists(files[2]) + assert gcs.exists(files[5]) # subdir file still exists + + # 3. Test non-matching pattern + with pytest.raises(FileNotFoundError): + gcs.rm(f"{base_dir}/non_existent*") + + # 4. Test recursive wildcard cleanup + gcs.rm(f"{base_dir}/**", recursive=True) + assert not gcs.exists(base_dir) + + +def test_rm_wildcard_bucket_non_recursive(gcs): + """Tests non-recursive rm with a wildcard at the bucket root.""" + # Test 'bucket/*' (non-recursive) + new_bucket = f"gcsfs-test-rm-{uuid.uuid4().hex}" + gcs.mkdir(new_bucket) + try: + gcs.touch(f"{new_bucket}/file1") + gcs.touch(f"{new_bucket}/subdir/file2") + gcs.rm(f"{new_bucket}/*") + assert not gcs.exists(f"{new_bucket}/file1") + assert gcs.exists(f"{new_bucket}/subdir/file2") + finally: + gcs.rm(new_bucket, recursive=True) + + +def test_rm_wildcard_bucket_recursive(gcs): + """Tests recursive rm with a wildcard at the bucket root.""" + # Test 'bucket/*' (recursive) + new_bucket = f"gcsfs-test-rm-recursive-{uuid.uuid4().hex}" + gcs.mkdir(new_bucket) + try: + gcs.touch(f"{new_bucket}/file1") + gcs.touch(f"{new_bucket}/subdir/file2") + gcs.rm(f"{new_bucket}/*", recursive=True) + assert not gcs.exists(f"{new_bucket}/file1") + assert not gcs.exists(f"{new_bucket}/subdir/file2") + finally: + if gcs.exists(new_bucket): + gcs.rm(new_bucket, recursive=True) + + +def test_rm_wildcards_non_recursive(gcs): + base_dir = f"{TEST_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}" + files = [ + f"{base_dir}/other.txt", + f"{base_dir}/subdir/nested.txt", + ] + for f in files: + gcs.touch(f) + + gcs.rm(f"{base_dir}/*") + assert not gcs.exists(files[0]) + assert gcs.exists(files[1]) # subdir file still exists + + def test_file_access(gcs): fn = TEST_BUCKET + "/nested/file1" data = b"hello\n" diff --git a/gcsfs/tests/test_extended_hns_gcsfs.py b/gcsfs/tests/test_extended_hns_gcsfs.py index 1c49a9a73..3280ecbb7 100644 --- a/gcsfs/tests/test_extended_hns_gcsfs.py +++ b/gcsfs/tests/test_extended_hns_gcsfs.py @@ -1116,7 +1116,7 @@ def test_hns_find_withdirs_maxdepth(self, gcs_hns, gcs_hns_mocks): _, call_kwargs = mocks["super_find"].call_args assert call_kwargs.get("withdirs") is False assert call_kwargs.get("detail") is True - assert call_kwargs.get("maxdepth") == 1 + assert call_kwargs.get("maxdepth") is None # Assert that list_folders was called with the correct request expected_folder_id = "find_test/"