Skip to content

Commit a46df25

Browse files
Update find cache and rm tests (#797)
* Fix find maxdepth caching
* Add rm wildcard tests for HNS and standard buckets
  - test_rm_wildcard_with_empty_folders for HNS
  - test_rm_wildcards_asterisk_non_recursive for HNS
  - test_rm_wildcards_and_lists for standard buckets
* Break rm test into smaller tests
* Exclude rm wildcard non-recursive test from zonal bucket
* Update HNS docs for non-recursive wildcard rm behavior
* Fix build in CI pipeline
1 parent 1ec7a98 commit a46df25

File tree

6 files changed

+177
-5
lines changed

6 files changed

+177
-5
lines changed

cloudbuild/run_tests.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,15 @@ case "$TEST_SUITE" in
5656
# - test_core_versioned.py: HNS buckets do not support versioning.
5757
# - test_core.py::test_sign: Current Cloud Build auth setup does not support this.
5858
# - test_core.py::test_mv_file_cache: Integration test only applicable for regional buckets.
59+
# - test_core.py::test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
5960
pytest "${ARGS[@]}" gcsfs/ \
6061
--deselect gcsfs/tests/test_extended_gcsfs.py \
6162
--deselect gcsfs/tests/test_zonal_file.py \
6263
--deselect gcsfs/tests/test_extended_gcsfs_unit.py \
6364
--deselect gcsfs/tests/test_core_versioned.py \
6465
--deselect gcsfs/tests/test_core.py::test_sign \
65-
--deselect gcsfs/tests/test_core.py::test_mv_file_cache
66+
--deselect gcsfs/tests/test_core.py::test_mv_file_cache \
67+
--deselect gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive
6668
;;
6769

6870
"zonal-core")
@@ -107,6 +109,7 @@ case "$TEST_SUITE" in
107109
# - test_array fails due to CRC32C TypeError with array objects.
108110
# - test_sign fails because it requires a private key
109111
# - test_mv_file_cache: Integration test only applicable for regional buckets.
112+
# - test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
110113
ZONAL_DESELECTS+=(
111114
"--deselect=gcsfs/tests/test_core.py::test_flush"
112115
"--deselect=gcsfs/tests/test_core.py::test_write_blocks"
@@ -115,6 +118,7 @@ case "$TEST_SUITE" in
115118
"--deselect=gcsfs/tests/test_core.py::test_array"
116119
"--deselect=gcsfs/tests/test_core.py::test_sign"
117120
"--deselect=gcsfs/tests/test_core.py::test_mv_file_cache"
121+
"--deselect=gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive"
118122
)
119123

120124
pytest "${ARGS[@]}" "${ZONAL_DESELECTS[@]}" gcsfs/tests/test_core.py

docs/source/hns_buckets.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ While ``gcsfs`` aims to abstract the differences via the ``fsspec`` API, you sho
5656
2. **``mkdir`` behavior:** Previously, in a flat namespace, calling ``mkdir`` on a path could only ensure the underlying bucket exists. With HNS enabled, calling ``mkdir`` will create an actual folder resource in GCS. Furthermore, if you want to create nested folders (eg: bucket/a/b/c/d), pass ``create_parents=True``, it will physically create all intermediate folder resources along the specified path.
5757
3. **No mixing or toggling:** You cannot toggle HNS on an existing flat-namespace bucket. You must create a new HNS bucket and migrate your data.
5858
4. **Object naming:** Object names in HNS cannot end with a slash (``/``) without the creation of physical folder resources.
59-
5. **Rename Operation Benchmarks**
59+
5. **Non-Recursive Wildcard Deletions:** When using wildcard deletions without recursion (e.g., ``gcs.rm("bucket/dir/*", recursive=False)``), if the pattern matches a non-empty directory, HNS buckets will raise an error (typically surfaced as an ``OSError`` due to a failed precondition). In contrast, standard flat-namespace buckets silently ignore non-empty directories under the same circumstances.
60+
6. **Rename Operation Benchmarks**
6061

6162
The following benchmarks show the time taken (in seconds) to rename a directory containing a large number of files (spread across 256 folders and 8 levels) in a standard Regional bucket versus an HNS bucket (can be replicated using `gcsfs/tests/perf/microbenchmarks/rename`):
6263

gcsfs/extended_gcsfs.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -811,7 +811,7 @@ async def _expand_path_with_details(
811811
raise ValueError("maxdepth must be at least 1")
812812

813813
if isinstance(path, str):
814-
out = await self._expand_path_with_details(
814+
return await self._expand_path_with_details(
815815
[path], recursive, maxdepth, detail=detail
816816
)
817817
else:
@@ -1000,6 +1000,8 @@ async def _find(
10001000
For buckets with flat structure, it falls back to the parent implementation.
10011001
"""
10021002
path = self._strip_protocol(path)
1003+
if maxdepth is not None and maxdepth < 1:
1004+
raise ValueError("maxdepth must be at least 1")
10031005
bucket, _, _ = self.split_path(path)
10041006

10051007
is_hns = await self._is_bucket_hns_enabled(bucket)
@@ -1017,14 +1019,16 @@ async def _find(
10171019

10181020
# Hybrid approach for HNS enabled buckets
10191021
# 1. Fetch all files from super find() method by passing withdirs as False.
1022+
# We pass maxdepth as None here to ensure we fetch all files for caching,
1023+
# and then filter by maxdepth at the end of this method.
10201024
files_task = asyncio.create_task(
10211025
super()._find(
10221026
path,
10231027
withdirs=False, # Fetch files only
10241028
detail=True, # Get full details for merging and populating cache
10251029
prefix=prefix,
10261030
versions=versions,
1027-
maxdepth=maxdepth,
1031+
maxdepth=None,
10281032
update_cache=False, # Defer caching until merging files and folders
10291033
**kwargs,
10301034
)

gcsfs/tests/integration/test_extended_hns.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,66 @@ def test_rm_empty_folder_cache_invalidation(self, gcs_hns):
918918
sibling_dir in gcsfs.dircache
919919
), "Sibling directory cache should not be invalidated."
920920

921+
def test_rm_wildcard_with_empty_folders(self, gcs_hns):
922+
"""Test deleting files and empty folders using wildcards on HNS bucket."""
923+
gcsfs = gcs_hns
924+
base_dir = f"{TEST_HNS_BUCKET}/rm_wildcard_empty_{uuid.uuid4().hex}"
925+
926+
# Setup: Some files, some empty folders, some non-empty folders
927+
empty_dir1 = f"{base_dir}/empty_1"
928+
empty_dir2 = f"{base_dir}/empty_2"
929+
empty_dir3 = f"{base_dir}/one_more_empty_dir"
930+
other_dir = f"{base_dir}/other_dir"
931+
file1 = f"{base_dir}/file1.txt"
932+
933+
gcsfs.mkdir(empty_dir1, create_parents=True)
934+
gcsfs.mkdir(empty_dir2)
935+
gcsfs.touch(file1)
936+
gcsfs.mkdir(empty_dir3)
937+
gcsfs.touch(f"{other_dir}/file2.txt")
938+
939+
# 1. Remove only matching empty folders using wildcard
940+
gcsfs.rm(f"{base_dir}/empty_*", recursive=True)
941+
942+
assert not gcsfs.exists(empty_dir1)
943+
assert not gcsfs.exists(empty_dir2)
944+
assert gcsfs.exists(file1)
945+
assert gcsfs.exists(other_dir)
946+
947+
# 2. Remove remaining using a broad wildcard
948+
gcsfs.rm(f"{base_dir}/*", recursive=True)
949+
950+
assert not gcsfs.exists(file1)
951+
assert not gcsfs.exists(other_dir)
952+
assert not gcsfs.exists(empty_dir3)
953+
954+
# Verify base_dir is now empty (HNS might still have the base_dir itself)
955+
assert gcsfs.ls(base_dir) == []
956+
957+
gcsfs.rm(base_dir, recursive=True)
958+
959+
assert not gcsfs.exists(base_dir)
960+
961+
def test_rm_wildcards_non_recursive(self, gcs_hns):
962+
"""Test 'base_dir/*' (non-recursive) raises an error on non-empty directories for HNS buckets."""
963+
gcsfs = gcs_hns
964+
base_dir = f"{TEST_HNS_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
965+
files = [
966+
f"{base_dir}/other.txt",
967+
f"{base_dir}/subdir/nested.txt",
968+
]
969+
for f in files:
970+
gcsfs.touch(f)
971+
972+
# 3. Test 'base_dir/*' (non-recursive)
973+
# For HNS, this matches `other.txt` and `subdir`. Because `subdir` is not empty,
974+
# delete_folder raises FailedPrecondition, which currently wraps to OSError.
975+
with pytest.raises(OSError, match="Pre condition failed"):
976+
gcsfs.rm(f"{base_dir}/*")
977+
978+
assert not gcsfs.exists(files[0])
979+
assert gcsfs.exists(files[1]) # subdir file still exists
980+
921981

922982
@pytest.fixture()
923983
def test_structure(gcs_hns):
@@ -1052,6 +1112,28 @@ def test_find_updates_dircache_without_prefix(
10521112
}
10531113
assert not empty_dir_listing
10541114

1115+
def test_find_maxdepth_updates_cache(self, gcs_hns, test_structure):
1116+
"""Test that find with maxdepth updates cache for deeper objects."""
1117+
base_dir = test_structure["base_dir"]
1118+
gcs_hns.invalidate_cache()
1119+
assert not gcs_hns.dircache
1120+
1121+
# Run find with maxdepth=1
1122+
# This should return root_file, empty_dir, dir_with_files
1123+
# But it should also cache objects within dir_with_files, etc.
1124+
result = gcs_hns.find(base_dir, maxdepth=1)
1125+
assert test_structure["root_file"] in result
1126+
assert test_structure["file1"] not in result
1127+
1128+
# Verify that the cache is populated for deeper objects even if not returned
1129+
# from find due to maxdepth filter
1130+
assert test_structure["dir_with_files"] in gcs_hns.dircache
1131+
dir_with_files_cache = {
1132+
d["name"] for d in gcs_hns.dircache[test_structure["dir_with_files"]]
1133+
}
1134+
assert test_structure["file1"] in dir_with_files_cache
1135+
assert test_structure["nested_dir"] in dir_with_files_cache
1136+
10551137
def test_find_does_not_update_dircache_with_prefix(self, gcs_hns, test_structure):
10561138
"""Test that find() does NOT populate the dircache when a prefix is given."""
10571139
base_dir = test_structure["base_dir"]

gcsfs/tests/test_core.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,87 @@ def test_rm_chunked_batch(gcs):
354354
assert fn not in files_removed
355355

356356

357+
def test_rm_wildcards_in_directory(gcs):
358+
base_dir = f"{TEST_BUCKET}/test_rm_complex_{uuid.uuid4().hex}"
359+
files = [
360+
f"{base_dir}/file1.txt",
361+
f"{base_dir}/file2.txt",
362+
f"{base_dir}/other.txt",
363+
f"{base_dir}/a1.dat",
364+
f"{base_dir}/b1.dat",
365+
f"{base_dir}/subdir/nested.txt",
366+
]
367+
for f in files:
368+
gcs.touch(f)
369+
370+
# 1. Test '?' wildcard (non-recursive)
371+
gcs.rm(f"{base_dir}/file?.txt")
372+
assert not gcs.exists(files[0])
373+
assert not gcs.exists(files[1])
374+
assert gcs.exists(files[2])
375+
assert gcs.exists(files[5])
376+
377+
# 2. Test list of patterns with '*' wildcards (non-recursive)
378+
gcs.rm([f"{base_dir}/a*", f"{base_dir}/b*"])
379+
assert not gcs.exists(files[3])
380+
assert not gcs.exists(files[4])
381+
assert gcs.exists(files[2])
382+
assert gcs.exists(files[5]) # subdir file still exists
383+
384+
# 3. Test non-matching pattern
385+
with pytest.raises(FileNotFoundError):
386+
gcs.rm(f"{base_dir}/non_existent*")
387+
388+
# 4. Test recursive wildcard cleanup
389+
gcs.rm(f"{base_dir}/**", recursive=True)
390+
assert not gcs.exists(base_dir)
391+
392+
393+
def test_rm_wildcard_bucket_non_recursive(gcs):
394+
"""Tests non-recursive rm with a wildcard at the bucket root."""
395+
# Test 'bucket/*' (non-recursive)
396+
new_bucket = f"gcsfs-test-rm-{uuid.uuid4().hex}"
397+
gcs.mkdir(new_bucket)
398+
try:
399+
gcs.touch(f"{new_bucket}/file1")
400+
gcs.touch(f"{new_bucket}/subdir/file2")
401+
gcs.rm(f"{new_bucket}/*")
402+
assert not gcs.exists(f"{new_bucket}/file1")
403+
assert gcs.exists(f"{new_bucket}/subdir/file2")
404+
finally:
405+
gcs.rm(new_bucket, recursive=True)
406+
407+
408+
def test_rm_wildcard_bucket_recursive(gcs):
409+
"""Tests recursive rm with a wildcard at the bucket root."""
410+
# Test 'bucket/*' (recursive)
411+
new_bucket = f"gcsfs-test-rm-recursive-{uuid.uuid4().hex}"
412+
gcs.mkdir(new_bucket)
413+
try:
414+
gcs.touch(f"{new_bucket}/file1")
415+
gcs.touch(f"{new_bucket}/subdir/file2")
416+
gcs.rm(f"{new_bucket}/*", recursive=True)
417+
assert not gcs.exists(f"{new_bucket}/file1")
418+
assert not gcs.exists(f"{new_bucket}/subdir/file2")
419+
finally:
420+
if gcs.exists(new_bucket):
421+
gcs.rm(new_bucket, recursive=True)
422+
423+
424+
def test_rm_wildcards_non_recursive(gcs):
425+
base_dir = f"{TEST_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
426+
files = [
427+
f"{base_dir}/other.txt",
428+
f"{base_dir}/subdir/nested.txt",
429+
]
430+
for f in files:
431+
gcs.touch(f)
432+
433+
gcs.rm(f"{base_dir}/*")
434+
assert not gcs.exists(files[0])
435+
assert gcs.exists(files[1]) # subdir file still exists
436+
437+
357438
def test_file_access(gcs):
358439
fn = TEST_BUCKET + "/nested/file1"
359440
data = b"hello\n"

gcsfs/tests/test_extended_hns_gcsfs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1116,7 +1116,7 @@ def test_hns_find_withdirs_maxdepth(self, gcs_hns, gcs_hns_mocks):
11161116
_, call_kwargs = mocks["super_find"].call_args
11171117
assert call_kwargs.get("withdirs") is False
11181118
assert call_kwargs.get("detail") is True
1119-
assert call_kwargs.get("maxdepth") == 1
1119+
assert call_kwargs.get("maxdepth") is None
11201120

11211121
# Assert that list_folders was called with the correct request
11221122
expected_folder_id = "find_test/"

0 commit comments

Comments
 (0)