Skip to content

Commit 77fe8fd

Browse files
authored
Reduce reads and file exists checks (#639)
* Remove more Path-wrapping methods. * Reduce metadata reads. * Docstring, test coverage.
1 parent b6cebe1 commit 77fe8fd

File tree

13 files changed

+58
-200
lines changed

13 files changed

+58
-200
lines changed

src/hats/catalog/dataset/collection_properties.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -213,8 +213,6 @@ def read_from_dir(cls, catalog_dir: str | Path | UPath) -> Self:
213213
new object from the contents of a ``collection.properties`` file in the directory.
214214
"""
215215
file_path = file_io.get_upath(catalog_dir) / "collection.properties"
216-
if not file_io.does_file_or_directory_exist(file_path):
217-
raise FileNotFoundError(f"No properties file found where expected: {str(file_path)}")
218216
p = Properties()
219217
with file_path.open("rb") as f:
220218
p.load(f, "utf-8")

src/hats/catalog/dataset/table_properties.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -299,9 +299,9 @@ def read_from_dir(cls, catalog_dir: str | Path | UPath) -> Self:
299299
"""
300300
catalog_path = file_io.get_upath(catalog_dir)
301301
file_path = catalog_path / "hats.properties"
302-
if not file_io.does_file_or_directory_exist(file_path):
302+
if not file_path.exists():
303303
file_path = catalog_path / "properties"
304-
if not file_io.does_file_or_directory_exist(file_path):
304+
if not file_path.exists():
305305
raise FileNotFoundError(f"No properties file found where expected: {str(file_path)}")
306306
p = Properties()
307307
with file_path.open("rb") as f:

src/hats/catalog/partition_info.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -107,9 +107,9 @@ def read_from_dir(cls, catalog_base_dir: str | Path | UPath | None) -> Partition
107107
A `PartitionInfo` object with the data from the file
108108
"""
109109
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
110-
if file_io.does_file_or_directory_exist(partition_info_file):
110+
try:
111111
pixel_list = PartitionInfo._read_from_csv(partition_info_file)
112-
else:
112+
except FileNotFoundError:
113113
warnings.warn("Computing partitions from catalog parquet files. This may be slow.")
114114

115115
# Read the dataset dir to get the list of files.
@@ -200,9 +200,7 @@ def _read_from_csv(cls, partition_info_file: str | Path | UPath) -> PartitionInf
200200
PartitionInfo
201201
A `PartitionInfo` object with the data from the file
202202
"""
203-
if not file_io.does_file_or_directory_exist(partition_info_file):
204-
raise FileNotFoundError(f"No partition info found where expected: {str(partition_info_file)}")
205-
203+
partition_info_file = file_io.get_upath(partition_info_file)
206204
data_frame = file_io.load_csv_to_pandas(partition_info_file)
207205

208206
return [

src/hats/io/file_io/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,7 @@
1717
write_string_to_file,
1818
)
1919
from .file_pointer import (
20-
append_paths_to_pointer,
2120
directory_has_contents,
22-
does_file_or_directory_exist,
23-
find_files_matching_path,
2421
get_upath,
2522
get_upath_for_protocol,
26-
is_regular_file,
2723
)

src/hats/io/file_io/file_io.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -211,8 +211,6 @@ def read_parquet_metadata(file_pointer: str | Path | UPath, **kwargs) -> pq.File
211211
parqeut file metadata (includes schema)
212212
"""
213213
file_pointer = get_upath(file_pointer)
214-
if file_pointer is None or not file_pointer.exists():
215-
raise FileNotFoundError("Parquet file does not exist")
216214
if _parquet_precache_all_bytes(file_pointer): # pragma: no cover
217215
return pq.read_metadata(BytesIO(file_pointer.read_bytes()), **kwargs)
218216

src/hats/io/file_io/file_pointer.py

Lines changed: 0 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -61,93 +61,6 @@ def get_upath_for_protocol(path: str | Path) -> UPath:
6161
return upath
6262

6363

64-
def append_paths_to_pointer(pointer: str | Path | UPath, *paths: str) -> UPath:
65-
"""Append directories and/or a file name to a specified file pointer.
66-
67-
Parameters
68-
----------
69-
pointer : str | Path | UPath
70-
`FilePointer` object to add path to
71-
*paths: str
72-
any number of directory names optionally followed by a file name to append to the
73-
pointer
74-
75-
Returns
76-
-------
77-
UPath
78-
New file pointer to path given by joining given pointer and path names
79-
"""
80-
pointer = get_upath(pointer)
81-
return pointer.joinpath(*paths)
82-
83-
84-
def does_file_or_directory_exist(pointer: str | Path | UPath) -> bool:
85-
"""Checks if a file or directory exists for a given file pointer
86-
87-
Parameters
88-
----------
89-
pointer : str | Path | UPath
90-
File Pointer to check if file or directory exists at
91-
92-
Returns
93-
-------
94-
bool
95-
True if file or directory at `pointer` exists, False if not
96-
"""
97-
pointer = get_upath(pointer)
98-
return pointer.exists()
99-
100-
101-
def is_regular_file(pointer: str | Path | UPath) -> bool:
102-
"""Checks if a regular file (NOT a directory) exists for a given file pointer.
103-
104-
Parameters
105-
----------
106-
pointer : str | Path | UPath
107-
File Pointer to check if a regular file
108-
109-
Returns
110-
-------
111-
bool
112-
True if regular file at `pointer` exists, False if not or is a directory
113-
"""
114-
pointer = get_upath(pointer)
115-
return pointer.is_file()
116-
117-
118-
def find_files_matching_path(pointer: str | Path | UPath, *paths: str) -> list[UPath]:
119-
"""Find files or directories matching the provided path parts.
120-
121-
Parameters
122-
----------
123-
pointer : str | Path | UPath
124-
base File Pointer in which to find contents
125-
*paths: str
126-
any number of directory names optionally followed by a file name.
127-
directory or file names may be replaced with `*` as a matcher.
128-
129-
Returns
130-
-------
131-
list[UPath]
132-
New file pointers to files found matching the path
133-
"""
134-
pointer = get_upath(pointer)
135-
136-
if len(paths) == 0:
137-
return [pointer]
138-
139-
matcher = pointer.fs.sep.join(paths)
140-
contents = []
141-
for child in pointer.rglob(matcher):
142-
contents.append(child)
143-
144-
if len(contents) == 0:
145-
return []
146-
147-
contents.sort()
148-
return contents
149-
150-
15164
def directory_has_contents(pointer: str | Path | UPath) -> bool:
15265
"""Checks if a directory already has some contents (any files or subdirectories)
15366

src/hats/io/parquet_metadata.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -528,12 +528,12 @@ def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | N
528528
path to a parquet file containing metadata schema.
529529
"""
530530
common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
531-
common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
531+
if common_metadata_file.exists():
532+
return common_metadata_file
532533
metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
533-
metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
534-
if not (common_metadata_exists or metadata_exists):
535-
return None
536-
return common_metadata_file if common_metadata_exists else metadata_file
534+
if metadata_file.exists():
535+
return metadata_file
536+
return None
537537

538538

539539
# pylint: disable=protected-access

src/hats/io/validation.py

Lines changed: 5 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,7 @@
1515
from hats.catalog.margin_cache.margin_catalog import MarginCatalog
1616
from hats.catalog.partition_info import PartitionInfo
1717
from hats.io import get_common_metadata_pointer, get_parquet_metadata_pointer, get_partition_info_pointer
18-
from hats.io.file_io import does_file_or_directory_exist, get_upath
19-
from hats.io.file_io.file_pointer import is_regular_file
18+
from hats.io.file_io import get_upath
2019
from hats.io.paths import get_healpix_from_path
2120
from hats.loaders import read_hats
2221
from hats.pixel_math.healpix_pixel import INVALID_PIXEL
@@ -242,7 +241,7 @@ def _is_valid_catalog_strict(pointer, handle_error, verbose):
242241
parquet_path_pixels = []
243242
for hats_file in dataset.files:
244243
hats_fp = UPath(hats_file, protocol=metadata_file.protocol, **metadata_file.storage_options)
245-
if not does_file_or_directory_exist(hats_fp):
244+
if not hats_fp.exists():
246245
handle_error(f"Pixel partition is missing: {hats_fp}")
247246
is_valid = False
248247
healpix_pixel = get_healpix_from_path(hats_file)
@@ -291,20 +290,14 @@ def is_collection_info_valid(pointer: str | Path | UPath) -> bool:
291290

292291
def _is_partition_info_valid(pointer: UPath) -> bool:
293292
"""Checks if partition_info is valid for a given base catalog pointer"""
294-
partition_info_pointer = get_partition_info_pointer(pointer)
295-
partition_info_exists = is_regular_file(partition_info_pointer)
296-
return partition_info_exists
293+
return get_partition_info_pointer(pointer).exists()
297294

298295

299296
def _is_metadata_valid(pointer: UPath) -> bool:
300297
"""Checks if _metadata is valid for a given base catalog pointer"""
301-
metadata_file = get_parquet_metadata_pointer(pointer)
302-
metadata_file_exists = is_regular_file(metadata_file)
303-
return metadata_file_exists
298+
return get_parquet_metadata_pointer(pointer).exists()
304299

305300

306301
def _is_common_metadata_valid(pointer: UPath) -> bool:
307302
"""Checks if _common_metadata is valid for a given base catalog pointer"""
308-
metadata_file = get_common_metadata_pointer(pointer)
309-
metadata_file_exists = is_regular_file(metadata_file)
310-
return metadata_file_exists
303+
return get_common_metadata_pointer(pointer).exists()

src/hats/loaders/read_hats.py

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,23 @@
2929
}
3030

3131

32-
def read_hats(catalog_path: str | Path | UPath) -> CatalogCollection | Dataset:
32+
def read_hats(
33+
catalog_path: str | Path | UPath, *, single_catalog: bool | None = None, read_moc: bool = True
34+
) -> CatalogCollection | Dataset:
3335
"""Reads a HATS Catalog from a HATS directory
3436
3537
Parameters
3638
----------
3739
catalog_path : str | Path | UPath
3840
path to the root directory of the catalog
41+
single_catalog: bool
42+
If you happen to already know that the `catalog_path` points to a
43+
single catalog, instead of a catalog collection, this flag can
44+
save a few file read operations.
45+
read_moc: bool
46+
If you happen to know that your catalog does not have a MOC (or if
47+
you know that your use case will not utilize a MOC), then you can
48+
skip the file read and memory load of the MOC.
3949
4050
Returns
4151
-------
@@ -50,20 +60,26 @@ def read_hats(catalog_path: str | Path | UPath) -> CatalogCollection | Dataset:
5060
catalog = hats.read_hats(UPath(..., anon=True))
5161
"""
5262
path = file_io.get_upath(catalog_path)
63+
if single_catalog is not None:
64+
if single_catalog:
65+
return _load_catalog(path, read_moc=read_moc)
66+
return _load_collection(path, read_moc=read_moc)
5367
if (path / "hats.properties").exists() or (path / "properties").exists():
54-
return _load_catalog(path)
68+
return _load_catalog(path, read_moc=read_moc)
5569
if (path / "collection.properties").exists():
56-
return _load_collection(path)
70+
return _load_collection(path, read_moc=read_moc)
5771
raise FileNotFoundError(f"Failed to read HATS at location {catalog_path}")
5872

5973

60-
def _load_collection(collection_path: UPath) -> CatalogCollection:
74+
def _load_collection(collection_path: UPath, read_moc: bool = True) -> CatalogCollection:
6175
collection_properties = CollectionProperties.read_from_dir(collection_path)
62-
main_catalog = _load_catalog(collection_path / collection_properties.hats_primary_table_url)
76+
main_catalog = _load_catalog(
77+
collection_path / collection_properties.hats_primary_table_url, read_moc=read_moc
78+
)
6379
return CatalogCollection(collection_path, collection_properties, main_catalog)
6480

6581

66-
def _load_catalog(catalog_path: UPath) -> Dataset:
82+
def _load_catalog(catalog_path: UPath, read_moc: bool = True) -> Dataset:
6783
properties = TableProperties.read_from_dir(catalog_path)
6884
dataset_type = properties.catalog_type
6985
if dataset_type not in DATASET_TYPE_TO_CLASS:
@@ -78,7 +94,8 @@ def _load_catalog(catalog_path: UPath) -> Dataset:
7894
}
7995
if _is_healpix_dataset(dataset_type):
8096
kwargs["pixels"] = PartitionInfo.read_from_dir(catalog_path)
81-
kwargs["moc"] = _read_moc_from_point_map(catalog_path)
97+
if read_moc:
98+
kwargs["moc"] = _read_moc_from_point_map(catalog_path)
8299
return loader(**kwargs)
83100

84101

@@ -95,7 +112,7 @@ def _is_healpix_dataset(dataset_type):
95112
def _read_moc_from_point_map(catalog_base_dir: str | Path | UPath) -> MOC | None:
96113
"""Reads a MOC object from the `point_map.fits` file if it exists in the catalog directory"""
97114
point_map_path = paths.get_point_map_file_pointer(catalog_base_dir)
98-
if not file_io.does_file_or_directory_exist(point_map_path):
115+
if not point_map_path.exists():
99116
return None
100117
fits_image = file_io.read_fits_image(point_map_path)
101118
order = hp.npix2order(len(fits_image))

tests/hats/catalog/loaders/test_read_hats.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -88,18 +88,20 @@ def test_read_hats_collection_info_only(collection_path):
8888

8989
def test_read_hats_branches(
9090
small_sky_dir,
91+
small_sky_collection_dir,
9192
small_sky_order1_dir,
9293
association_catalog_path,
9394
small_sky_source_object_index_dir,
9495
margin_catalog_path,
9596
small_sky_source_dir,
9697
test_data_dir,
9798
):
98-
read_hats(small_sky_dir)
99+
read_hats(small_sky_dir, single_catalog=True)
100+
read_hats(small_sky_collection_dir, single_catalog=False)
99101
read_hats(small_sky_order1_dir)
100102
read_hats(association_catalog_path)
101103
read_hats(small_sky_source_object_index_dir)
102-
read_hats(margin_catalog_path)
104+
read_hats(margin_catalog_path, read_moc=False)
103105
read_hats(small_sky_source_dir)
104106
read_hats(test_data_dir / "square_map")
105107
read_hats(test_data_dir / "small_sky_healpix13")

0 commit comments

Comments (0)