Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cloudbuild/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ case "$TEST_SUITE" in
# - test_core_versioned.py: HNS buckets do not support versioning.
# - test_core.py::test_sign: Current Cloud Build auth setup does not support this.
# - test_core.py::test_mv_file_cache: Integration test only applicable for regional buckets.
# - test_core.py::test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
pytest "${ARGS[@]}" gcsfs/ \
--deselect gcsfs/tests/test_extended_gcsfs.py \
--deselect gcsfs/tests/test_zonal_file.py \
--deselect gcsfs/tests/test_extended_gcsfs_unit.py \
--deselect gcsfs/tests/test_core_versioned.py \
--deselect gcsfs/tests/test_core.py::test_sign \
--deselect gcsfs/tests/test_core.py::test_mv_file_cache
--deselect gcsfs/tests/test_core.py::test_mv_file_cache \
--deselect gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive
;;

"zonal-core")
Expand Down Expand Up @@ -107,6 +109,7 @@ case "$TEST_SUITE" in
# - test_array fails due to CRC32C TypeError with array objects.
# - test_sign fails because it requires a private key
# - test_mv_file_cache: Integration test only applicable for regional buckets.
# - test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
ZONAL_DESELECTS+=(
"--deselect=gcsfs/tests/test_core.py::test_flush"
"--deselect=gcsfs/tests/test_core.py::test_write_blocks"
Expand All @@ -115,6 +118,7 @@ case "$TEST_SUITE" in
"--deselect=gcsfs/tests/test_core.py::test_array"
"--deselect=gcsfs/tests/test_core.py::test_sign"
"--deselect=gcsfs/tests/test_core.py::test_mv_file_cache"
"--deselect=gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive"
)

pytest "${ARGS[@]}" "${ZONAL_DESELECTS[@]}" gcsfs/tests/test_core.py
Expand Down
3 changes: 2 additions & 1 deletion docs/source/hns_buckets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ While ``gcsfs`` aims to abstract the differences via the ``fsspec`` API, you sho
2. **``mkdir`` behavior:** Previously, in a flat namespace, calling ``mkdir`` on a path could only ensure the underlying bucket exists. With HNS enabled, calling ``mkdir`` will create an actual folder resource in GCS. Furthermore, if you want to create nested folders (eg: bucket/a/b/c/d) pass ``create_parents=True``, it will physically create all intermediate folder resources along the specified path.
3. **No mixing or toggling:** You cannot toggle HNS on an existing flat-namespace bucket. You must create a new HNS bucket and migrate your data.
4. **Object naming:** Object names in HNS cannot end with a slash (``/``); trailing-slash paths are instead represented by physical folder resources.
5. **Rename Operation Benchmarks**
5. **Non-Recursive Wildcard Deletions:** When using wildcard deletions without recursion (e.g., ``gcs.rm("bucket/dir/*", recursive=False)``), if the pattern matches a non-empty directory, HNS buckets will raise an error (typically surfaced as an ``OSError`` due to a failed precondition). In contrast, standard flat-namespace buckets silently ignore non-empty directories under the same circumstances.
6. **Rename Operation Benchmarks**

The following benchmarks show the time taken (in seconds) to rename a directory containing a large number of files (spread across 256 folders and 8 levels) in a standard Regional bucket versus an HNS bucket (can be replicated using `gcsfs/tests/perf/microbenchmarks/rename`):

Expand Down
8 changes: 6 additions & 2 deletions gcsfs/extended_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ async def _expand_path_with_details(
raise ValueError("maxdepth must be at least 1")

if isinstance(path, str):
out = await self._expand_path_with_details(
return await self._expand_path_with_details(
[path], recursive, maxdepth, detail=detail
)
else:
Expand Down Expand Up @@ -1000,6 +1000,8 @@ async def _find(
For buckets with flat structure, it falls back to the parent implementation.
"""
path = self._strip_protocol(path)
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
bucket, _, _ = self.split_path(path)

is_hns = await self._is_bucket_hns_enabled(bucket)
Expand All @@ -1017,14 +1019,16 @@ async def _find(

# Hybrid approach for HNS enabled buckets
# 1. Fetch all files from super find() method by passing withdirs as False.
# We pass maxdepth as None here to ensure we fetch all files for caching,
# and then filter by maxdepth at the end of this method.
files_task = asyncio.create_task(
super()._find(
path,
withdirs=False, # Fetch files only
detail=True, # Get full details for merging and populating cache
prefix=prefix,
versions=versions,
maxdepth=maxdepth,
maxdepth=None,
update_cache=False, # Defer caching until merging files and folders
**kwargs,
)
Expand Down
82 changes: 82 additions & 0 deletions gcsfs/tests/integration/test_extended_hns.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,66 @@ def test_rm_empty_folder_cache_invalidation(self, gcs_hns):
sibling_dir in gcsfs.dircache
), "Sibling directory cache should not be invalidated."

def test_rm_wildcard_with_empty_folders(self, gcs_hns):
    """Test deleting files and empty folders using wildcards on HNS bucket."""
    fs = gcs_hns
    base_dir = f"{TEST_HNS_BUCKET}/rm_wildcard_empty_{uuid.uuid4().hex}"

    # Layout: one file, two wildcard-matching empty folders, one
    # non-matching empty folder, and one non-empty folder.
    empty_dir1 = f"{base_dir}/empty_1"
    empty_dir2 = f"{base_dir}/empty_2"
    empty_dir3 = f"{base_dir}/one_more_empty_dir"
    other_dir = f"{base_dir}/other_dir"
    file1 = f"{base_dir}/file1.txt"

    fs.mkdir(empty_dir1, create_parents=True)
    fs.mkdir(empty_dir2)
    fs.touch(file1)
    fs.mkdir(empty_dir3)
    fs.touch(f"{other_dir}/file2.txt")

    # 1. A pattern matching only the empty_* folders removes just those.
    fs.rm(f"{base_dir}/empty_*", recursive=True)

    for gone in (empty_dir1, empty_dir2):
        assert not fs.exists(gone)
    for kept in (file1, other_dir):
        assert fs.exists(kept)

    # 2. A broad wildcard clears everything remaining under base_dir.
    fs.rm(f"{base_dir}/*", recursive=True)

    for gone in (file1, other_dir, empty_dir3):
        assert not fs.exists(gone)

    # base_dir itself may survive as an (empty) HNS folder resource.
    assert fs.ls(base_dir) == []

    fs.rm(base_dir, recursive=True)

    assert not fs.exists(base_dir)

def test_rm_wildcards_non_recursive(self, gcs_hns):
    """Test 'base_dir/*' (non-recursive) raises an error on non-empty directories for HNS buckets."""
    fs = gcs_hns
    base_dir = f"{TEST_HNS_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
    top_file = f"{base_dir}/other.txt"
    nested_file = f"{base_dir}/subdir/nested.txt"
    fs.touch(top_file)
    fs.touch(nested_file)

    # The pattern matches `other.txt` and `subdir`. Since `subdir` is
    # non-empty, delete_folder fails its precondition, which currently
    # surfaces as an OSError.
    with pytest.raises(OSError, match="Pre condition failed"):
        fs.rm(f"{base_dir}/*")

    assert not fs.exists(top_file)
    assert fs.exists(nested_file)  # subdir file still exists


@pytest.fixture()
def test_structure(gcs_hns):
Expand Down Expand Up @@ -1052,6 +1112,28 @@ def test_find_updates_dircache_without_prefix(
}
assert not empty_dir_listing

def test_find_maxdepth_updates_cache(self, gcs_hns, test_structure):
    """Test that find with maxdepth updates cache for deeper objects."""
    fs = gcs_hns
    structure = test_structure
    base_dir = structure["base_dir"]

    fs.invalidate_cache()
    assert not fs.dircache

    # find() with maxdepth=1 returns only the top-level entries
    # (root_file, empty_dir, dir_with_files) ...
    result = fs.find(base_dir, maxdepth=1)
    assert structure["root_file"] in result
    assert structure["file1"] not in result

    # ... but the cache should still be populated for deeper objects that
    # were filtered out of the result by maxdepth.
    deep_dir = structure["dir_with_files"]
    assert deep_dir in fs.dircache
    cached_names = {entry["name"] for entry in fs.dircache[deep_dir]}
    assert structure["file1"] in cached_names
    assert structure["nested_dir"] in cached_names

def test_find_does_not_update_dircache_with_prefix(self, gcs_hns, test_structure):
"""Test that find() does NOT populate the dircache when a prefix is given."""
base_dir = test_structure["base_dir"]
Expand Down
81 changes: 81 additions & 0 deletions gcsfs/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,87 @@ def test_rm_chunked_batch(gcs):
assert fn not in files_removed


def test_rm_wildcards_in_directory(gcs):
    """Exercise rm() with '?', '*', and '**' wildcard patterns."""
    base_dir = f"{TEST_BUCKET}/test_rm_complex_{uuid.uuid4().hex}"
    file1 = f"{base_dir}/file1.txt"
    file2 = f"{base_dir}/file2.txt"
    other = f"{base_dir}/other.txt"
    a_dat = f"{base_dir}/a1.dat"
    b_dat = f"{base_dir}/b1.dat"
    nested = f"{base_dir}/subdir/nested.txt"
    for path in (file1, file2, other, a_dat, b_dat, nested):
        gcs.touch(path)

    # 1. '?' matches a single character (non-recursive): only file1/file2 go.
    gcs.rm(f"{base_dir}/file?.txt")
    assert not gcs.exists(file1)
    assert not gcs.exists(file2)
    assert gcs.exists(other)
    assert gcs.exists(nested)

    # 2. A list of '*' patterns is handled in one call (non-recursive).
    gcs.rm([f"{base_dir}/a*", f"{base_dir}/b*"])
    assert not gcs.exists(a_dat)
    assert not gcs.exists(b_dat)
    assert gcs.exists(other)
    assert gcs.exists(nested)  # subdir file still exists

    # 3. A pattern with no matches raises.
    with pytest.raises(FileNotFoundError):
        gcs.rm(f"{base_dir}/non_existent*")

    # 4. '**' with recursive=True wipes the remaining tree.
    gcs.rm(f"{base_dir}/**", recursive=True)
    assert not gcs.exists(base_dir)


def test_rm_wildcard_bucket_non_recursive(gcs):
    """Tests non-recursive rm with a wildcard at the bucket root.

    'bucket/*' without recursive=True should delete only top-level files,
    leaving objects inside subdirectories untouched.
    """
    # Test 'bucket/*' (non-recursive)
    new_bucket = f"gcsfs-test-rm-{uuid.uuid4().hex}"
    gcs.mkdir(new_bucket)
    try:
        gcs.touch(f"{new_bucket}/file1")
        gcs.touch(f"{new_bucket}/subdir/file2")
        gcs.rm(f"{new_bucket}/*")
        assert not gcs.exists(f"{new_bucket}/file1")
        assert gcs.exists(f"{new_bucket}/subdir/file2")
    finally:
        # Guard the cleanup (as in test_rm_wildcard_bucket_recursive) so a
        # failure in setup/asserts is not masked by a secondary error here.
        if gcs.exists(new_bucket):
            gcs.rm(new_bucket, recursive=True)


def test_rm_wildcard_bucket_recursive(gcs):
    """Tests recursive rm with a wildcard at the bucket root."""
    # Test 'bucket/*' (recursive)
    new_bucket = f"gcsfs-test-rm-recursive-{uuid.uuid4().hex}"
    gcs.mkdir(new_bucket)
    top_file = f"{new_bucket}/file1"
    nested_file = f"{new_bucket}/subdir/file2"
    try:
        for path in (top_file, nested_file):
            gcs.touch(path)
        # recursive=True descends into subdir as well as top-level files.
        gcs.rm(f"{new_bucket}/*", recursive=True)
        for path in (top_file, nested_file):
            assert not gcs.exists(path)
    finally:
        if gcs.exists(new_bucket):
            gcs.rm(new_bucket, recursive=True)


def test_rm_wildcards_non_recursive(gcs):
    """Non-recursive wildcard rm removes matching top-level files only."""
    base_dir = f"{TEST_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
    top_file = f"{base_dir}/other.txt"
    nested_file = f"{base_dir}/subdir/nested.txt"
    gcs.touch(top_file)
    gcs.touch(nested_file)

    gcs.rm(f"{base_dir}/*")
    assert not gcs.exists(top_file)
    assert gcs.exists(nested_file)  # subdir file still exists


def test_file_access(gcs):
fn = TEST_BUCKET + "/nested/file1"
data = b"hello\n"
Expand Down
2 changes: 1 addition & 1 deletion gcsfs/tests/test_extended_hns_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ def test_hns_find_withdirs_maxdepth(self, gcs_hns, gcs_hns_mocks):
_, call_kwargs = mocks["super_find"].call_args
assert call_kwargs.get("withdirs") is False
assert call_kwargs.get("detail") is True
assert call_kwargs.get("maxdepth") == 1
assert call_kwargs.get("maxdepth") is None

# Assert that list_folders was called with the correct request
expected_folder_id = "find_test/"
Expand Down
Loading