Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cloudbuild/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ case "$TEST_SUITE" in
# - test_core_versioned.py: HNS buckets do not support versioning.
# - test_core.py::test_sign: Current Cloud Build auth setup does not support this.
# - test_core.py::test_mv_file_cache: Integration test only applicable for regional buckets.
# - test_core.py::test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
pytest "${ARGS[@]}" gcsfs/ \
--deselect gcsfs/tests/test_extended_gcsfs.py \
--deselect gcsfs/tests/test_zonal_file.py \
--deselect gcsfs/tests/test_extended_gcsfs_unit.py \
--deselect gcsfs/tests/test_core_versioned.py \
--deselect gcsfs/tests/test_core.py::test_sign \
--deselect gcsfs/tests/test_core.py::test_mv_file_cache
--deselect gcsfs/tests/test_core.py::test_mv_file_cache \
--deselect gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive
;;

"zonal-core")
Expand Down Expand Up @@ -107,6 +109,7 @@ case "$TEST_SUITE" in
# - test_array fails due to CRC32C TypeError with array objects.
# - test_sign fails because it requires a private key
# - test_mv_file_cache: Integration test only applicable for regional buckets.
# - test_rm_wildcards_non_recursive: HNS buckets have different behavior for non-recursive wildcard deletion.
ZONAL_DESELECTS+=(
"--deselect=gcsfs/tests/test_core.py::test_flush"
"--deselect=gcsfs/tests/test_core.py::test_write_blocks"
Expand All @@ -115,6 +118,7 @@ case "$TEST_SUITE" in
"--deselect=gcsfs/tests/test_core.py::test_array"
"--deselect=gcsfs/tests/test_core.py::test_sign"
"--deselect=gcsfs/tests/test_core.py::test_mv_file_cache"
"--deselect=gcsfs/tests/test_core.py::test_rm_wildcards_non_recursive"
)

pytest "${ARGS[@]}" "${ZONAL_DESELECTS[@]}" gcsfs/tests/test_core.py
Expand Down
3 changes: 2 additions & 1 deletion docs/source/hns_buckets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ While ``gcsfs`` aims to abstract the differences via the ``fsspec`` API, you sho
2. **``mkdir`` behavior:** Previously, in a flat namespace, calling ``mkdir`` on a path could only ensure the underlying bucket exists. With HNS enabled, calling ``mkdir`` will create an actual folder resource in GCS. Furthermore, if you want to create nested folders (eg: bucket/a/b/c/d), pass ``create_parents=True``, it will physically create all intermediate folder resources along the specified path.
3. **No mixing or toggling:** You cannot toggle HNS on an existing flat-namespace bucket. You must create a new HNS bucket and migrate your data.
4. **Object naming:** Object names in HNS cannot end with a slash (``/``); a trailing slash is reserved for physical folder resources, which must be created explicitly.
5. **Rename Operation Benchmarks**
5. **Non-Recursive Wildcard Deletions:** When using wildcard deletions without recursion (e.g., ``gcs.rm("bucket/dir/*", recursive=False)``), if the pattern matches a non-empty directory, HNS buckets will raise an error (typically surfaced as an ``OSError`` due to a failed precondition). In contrast, standard flat-namespace buckets silently ignore non-empty directories under the same circumstances.
6. **Rename Operation Benchmarks**

The following benchmarks show the time taken (in seconds) to rename a directory containing a large number of files (spread across 256 folders and 8 levels) in a standard Regional bucket versus an HNS bucket (can be replicated using `gcsfs/tests/perf/microbenchmarks/rename`):

Expand Down
8 changes: 6 additions & 2 deletions gcsfs/extended_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ async def _expand_path_with_details(
raise ValueError("maxdepth must be at least 1")

if isinstance(path, str):
out = await self._expand_path_with_details(
return await self._expand_path_with_details(
[path], recursive, maxdepth, detail=detail
)
else:
Expand Down Expand Up @@ -1000,6 +1000,8 @@ async def _find(
For buckets with flat structure, it falls back to the parent implementation.
"""
path = self._strip_protocol(path)
if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")
bucket, _, _ = self.split_path(path)

is_hns = await self._is_bucket_hns_enabled(bucket)
Expand All @@ -1017,14 +1019,16 @@ async def _find(

# Hybrid approach for HNS enabled buckets
# 1. Fetch all files from super find() method by passing withdirs as False.
# We pass maxdepth as None here to ensure we fetch all files for caching,
# and then filter by maxdepth at the end of this method.
files_task = asyncio.create_task(
super()._find(
path,
withdirs=False, # Fetch files only
detail=True, # Get full details for merging and populating cache
prefix=prefix,
versions=versions,
maxdepth=maxdepth,
maxdepth=None,
update_cache=False, # Defer caching until merging files and folders
**kwargs,
)
Expand Down
82 changes: 82 additions & 0 deletions gcsfs/tests/integration/test_extended_hns.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,66 @@ def test_rm_empty_folder_cache_invalidation(self, gcs_hns):
sibling_dir in gcsfs.dircache
), "Sibling directory cache should not be invalidated."

def test_rm_wildcard_with_empty_folders(self, gcs_hns):
    """Wildcard deletion of files and empty folders on an HNS bucket."""
    fs = gcs_hns
    base_dir = f"{TEST_HNS_BUCKET}/rm_wildcard_empty_{uuid.uuid4().hex}"

    # Layout: two wildcard-matching empty folders, one extra empty folder,
    # one non-empty folder, and one loose file.
    empty_dir1 = f"{base_dir}/empty_1"
    empty_dir2 = f"{base_dir}/empty_2"
    empty_dir3 = f"{base_dir}/one_more_empty_dir"
    other_dir = f"{base_dir}/other_dir"
    file1 = f"{base_dir}/file1.txt"

    fs.mkdir(empty_dir1, create_parents=True)
    fs.mkdir(empty_dir2)
    fs.touch(file1)
    fs.mkdir(empty_dir3)
    fs.touch(f"{other_dir}/file2.txt")

    # 1. A targeted wildcard removes only the matching empty folders.
    fs.rm(f"{base_dir}/empty_*", recursive=True)

    for gone in (empty_dir1, empty_dir2):
        assert not fs.exists(gone)
    for kept in (file1, other_dir):
        assert fs.exists(kept)

    # 2. A broad wildcard clears out everything that remains.
    fs.rm(f"{base_dir}/*", recursive=True)

    for gone in (file1, other_dir, empty_dir3):
        assert not fs.exists(gone)

    # The base directory itself may still exist on HNS, but must be empty now.
    assert fs.ls(base_dir) == []

    fs.rm(base_dir, recursive=True)

    assert not fs.exists(base_dir)

def test_rm_wildcards_non_recursive(self, gcs_hns):
    """Test 'base_dir/*' (non-recursive) raises an error on non-empty directories for HNS buckets."""
    fs = gcs_hns
    base_dir = f"{TEST_HNS_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
    flat_file = f"{base_dir}/other.txt"
    nested_file = f"{base_dir}/subdir/nested.txt"
    fs.touch(flat_file)
    fs.touch(nested_file)

    # 3. Test 'base_dir/*' (non-recursive)
    # The wildcard matches `other.txt` and `subdir`. Because `subdir` is not
    # empty, delete_folder raises FailedPrecondition, which currently wraps
    # to OSError.
    with pytest.raises(OSError, match="Pre condition failed"):
        fs.rm(f"{base_dir}/*")

    assert not fs.exists(flat_file)
    assert fs.exists(nested_file)  # subdir file still exists


@pytest.fixture()
def test_structure(gcs_hns):
Expand Down Expand Up @@ -1052,6 +1112,28 @@ def test_find_updates_dircache_without_prefix(
}
assert not empty_dir_listing

def test_find_maxdepth_updates_cache(self, gcs_hns, test_structure):
    """find() with maxdepth should still cache entries deeper than the cutoff."""
    base_dir = test_structure["base_dir"]
    gcs_hns.invalidate_cache()
    assert not gcs_hns.dircache

    # With maxdepth=1, only top-level entries (root_file, empty_dir,
    # dir_with_files) are returned...
    found = gcs_hns.find(base_dir, maxdepth=1)
    assert test_structure["root_file"] in found
    assert test_structure["file1"] not in found

    # ...but deeper objects were still fetched, so they must be present in
    # the dircache even though the maxdepth filter excluded them from the
    # returned result.
    assert test_structure["dir_with_files"] in gcs_hns.dircache
    cached_names = {
        entry["name"]
        for entry in gcs_hns.dircache[test_structure["dir_with_files"]]
    }
    assert test_structure["file1"] in cached_names
    assert test_structure["nested_dir"] in cached_names

def test_find_does_not_update_dircache_with_prefix(self, gcs_hns, test_structure):
"""Test that find() does NOT populate the dircache when a prefix is given."""
base_dir = test_structure["base_dir"]
Expand Down
81 changes: 81 additions & 0 deletions gcsfs/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,87 @@ def test_rm_chunked_batch(gcs):
assert fn not in files_removed


def test_rm_wildcards_in_directory(gcs):
    """Exercise '?', '*', and '**' wildcard patterns with rm()."""
    base_dir = f"{TEST_BUCKET}/test_rm_complex_{uuid.uuid4().hex}"
    numbered_files = [f"{base_dir}/file1.txt", f"{base_dir}/file2.txt"]
    other_file = f"{base_dir}/other.txt"
    dat_files = [f"{base_dir}/a1.dat", f"{base_dir}/b1.dat"]
    nested_file = f"{base_dir}/subdir/nested.txt"

    for path in (*numbered_files, other_file, *dat_files, nested_file):
        gcs.touch(path)

    # 1. Test '?' wildcard (non-recursive)
    gcs.rm(f"{base_dir}/file?.txt")
    assert not gcs.exists(numbered_files[0])
    assert not gcs.exists(numbered_files[1])
    assert gcs.exists(other_file)
    assert gcs.exists(nested_file)

    # 2. Test list of patterns with '*' wildcards (non-recursive)
    gcs.rm([f"{base_dir}/a*", f"{base_dir}/b*"])
    assert not gcs.exists(dat_files[0])
    assert not gcs.exists(dat_files[1])
    assert gcs.exists(other_file)
    assert gcs.exists(nested_file)  # subdir file still exists

    # 3. Test non-matching pattern
    with pytest.raises(FileNotFoundError):
        gcs.rm(f"{base_dir}/non_existent*")

    # 4. Test recursive wildcard cleanup
    gcs.rm(f"{base_dir}/**", recursive=True)
    assert not gcs.exists(base_dir)


def test_rm_wildcard_bucket_non_recursive(gcs):
    """Tests non-recursive rm with a wildcard at the bucket root.

    'bucket/*' without recursion removes top-level files but leaves
    objects inside subdirectories untouched.
    """
    # Test 'bucket/*' (non-recursive)
    new_bucket = f"gcsfs-test-rm-{uuid.uuid4().hex}"
    gcs.mkdir(new_bucket)
    try:
        gcs.touch(f"{new_bucket}/file1")
        gcs.touch(f"{new_bucket}/subdir/file2")
        gcs.rm(f"{new_bucket}/*")
        assert not gcs.exists(f"{new_bucket}/file1")
        assert gcs.exists(f"{new_bucket}/subdir/file2")
    finally:
        # Guard the cleanup (matching test_rm_wildcard_bucket_recursive) so a
        # failure earlier in the test body is not masked by an error raised
        # while removing an already-deleted bucket.
        if gcs.exists(new_bucket):
            gcs.rm(new_bucket, recursive=True)


def test_rm_wildcard_bucket_recursive(gcs):
    """Tests recursive rm with a wildcard at the bucket root."""
    # Test 'bucket/*' (recursive)
    new_bucket = f"gcsfs-test-rm-recursive-{uuid.uuid4().hex}"
    gcs.mkdir(new_bucket)
    try:
        created = (f"{new_bucket}/file1", f"{new_bucket}/subdir/file2")
        for path in created:
            gcs.touch(path)
        gcs.rm(f"{new_bucket}/*", recursive=True)
        # Recursive wildcard removes both top-level and nested objects.
        for path in created:
            assert not gcs.exists(path)
    finally:
        if gcs.exists(new_bucket):
            gcs.rm(new_bucket, recursive=True)


def test_rm_wildcards_non_recursive(gcs):
    """Non-recursive 'base_dir/*' removes only top-level files on flat buckets."""
    base_dir = f"{TEST_BUCKET}/test_rm_asterisk_{uuid.uuid4().hex}"
    top_file = f"{base_dir}/other.txt"
    nested_file = f"{base_dir}/subdir/nested.txt"
    gcs.touch(top_file)
    gcs.touch(nested_file)

    gcs.rm(f"{base_dir}/*")
    assert not gcs.exists(top_file)
    assert gcs.exists(nested_file)  # subdir file still exists


def test_file_access(gcs):
fn = TEST_BUCKET + "/nested/file1"
data = b"hello\n"
Expand Down
2 changes: 1 addition & 1 deletion gcsfs/tests/test_extended_hns_gcsfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ def test_hns_find_withdirs_maxdepth(self, gcs_hns, gcs_hns_mocks):
_, call_kwargs = mocks["super_find"].call_args
assert call_kwargs.get("withdirs") is False
assert call_kwargs.get("detail") is True
assert call_kwargs.get("maxdepth") == 1
assert call_kwargs.get("maxdepth") is None

# Assert that list_folders was called with the correct request
expected_folder_id = "find_test/"
Expand Down
Loading