diff --git a/.copier-answers.yml b/.copier-answers.yml
index e7a8798b..88e04b91 100644
--- a/.copier-answers.yml
+++ b/.copier-answers.yml
@@ -20,7 +20,6 @@ project_license: BSD
 project_name: hats
 project_organization: astronomy-commons
 python_versions:
-- '3.10'
 - '3.11'
 - '3.12'
 - '3.13'
diff --git a/.github/workflows/asv-main.yml b/.github/workflows/asv-main.yml
index 34b3ced7..957e6322 100644
--- a/.github/workflows/asv-main.yml
+++ b/.github/workflows/asv-main.yml
@@ -8,7 +8,7 @@ on:
     branches: [ main ]
 
 env:
-  PYTHON_VERSION: "3.11"
+  PYTHON_VERSION: "3.12"
   ASV_VERSION: "0.6.5"
   WORKING_DIR: ${{github.workspace}}/benchmarks
diff --git a/.github/workflows/asv-nightly.yml b/.github/workflows/asv-nightly.yml
index d0dbf4ce..00b89972 100644
--- a/.github/workflows/asv-nightly.yml
+++ b/.github/workflows/asv-nightly.yml
@@ -9,7 +9,7 @@ on:
   workflow_dispatch:
 
 env:
-  PYTHON_VERSION: "3.11"
+  PYTHON_VERSION: "3.12"
   ASV_VERSION: "0.6.5"
   WORKING_DIR: ${{github.workspace}}/benchmarks
   NIGHTLY_HASH_FILE: nightly-hash
diff --git a/.github/workflows/asv-pr.yml b/.github/workflows/asv-pr.yml
index e52157fd..23acf458 100644
--- a/.github/workflows/asv-pr.yml
+++ b/.github/workflows/asv-pr.yml
@@ -15,7 +15,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: "3.11"
+  PYTHON_VERSION: "3.12"
   ASV_VERSION: "0.6.5"
   WORKING_DIR: ${{github.workspace}}/benchmarks
   ARTIFACTS_DIR: ${{github.workspace}}/artifacts
diff --git a/.github/workflows/pre-commit-ci.yml b/.github/workflows/pre-commit-ci.yml
index c718066f..a8c85bf6 100644
--- a/.github/workflows/pre-commit-ci.yml
+++ b/.github/workflows/pre-commit-ci.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
-          python-version: '3.11'
+          python-version: '3.12'
       - name: Install uv
         uses: astral-sh/setup-uv@v7
       - name: Install dependencies
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index 6146a752..7262e296 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -26,7 +26,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
-          python-version: '3.11'
+          python-version: '3.12'
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index 63c161e8..bfa56170 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.11', '3.12', '3.13']
 
     steps:
       - uses: actions/checkout@v5
diff --git a/.github/workflows/testing-and-coverage.yml b/.github/workflows/testing-and-coverage.yml
index 2c18f6a8..72bfde78 100644
--- a/.github/workflows/testing-and-coverage.yml
+++ b/.github/workflows/testing-and-coverage.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.11', '3.12', '3.13']
 
     steps:
       - uses: actions/checkout@v5
@@ -43,10 +43,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v5
-      - name: Set up Python 3.10
+      - name: Set up Python 3.11
        uses: actions/setup-python@v6
        with:
-          python-version: '3.10'
+          python-version: '3.11'
       - name: Install dependencies
        run: |
          sudo apt-get update
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9e6a5486..351e9541 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -94,7 +94,7 @@ repos:
         # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3.11
+        language_version: python3.12
  # Make sure Sphinx can build the documentation while explicitly omitting
  # notebooks from the docs, so users don't have to wait through the execution
  # of each notebook or each commit. By default, these will be checked in the
diff --git a/.readthedocs.yml b/.readthedocs.yml
index e83928da..578fd870 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,7 +8,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.11"
+    python: "3.12"
 
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index 55ab772f..31572c88 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -37,7 +37,7 @@
     // The Pythons you'd like to test against. If not provided, defaults
     // to the current version of Python used to run `asv`.
     "pythons": [
-        "3.11"
+        "3.12"
     ],
     // The matrix of dependencies to test. Each key is the name of a
     // package (in PyPI) and the values are version numbers. An empty
diff --git a/pyproject.toml b/pyproject.toml
index 0664cacf..bd9b3b17 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,10 +16,10 @@ classifiers = [
     "Programming Language :: Python",
 ]
 dynamic = ["version"]
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 dependencies = [
     "aiohttp>=3.8.0", # http filesystem support
-    "astropy>=6.1.5",
+    "astropy>=7.0.0",
     "cdshealpix>=0.7.0",
     "fsspec>=2023.10.0", # Used for abstract filesystems
     "jproperties>=2.0.0",
@@ -84,7 +84,7 @@ omit=["src/hats/_version.py"]
 
 [tool.black]
 line-length = 110
-target-version = ["py310"]
+target-version = ["py311"]
 
 [tool.isort]
 profile = "black"
 line_length = 110
diff --git a/src/.pylintrc b/src/.pylintrc
index ef8deb6d..38327354 100644
--- a/src/.pylintrc
+++ b/src/.pylintrc
@@ -87,7 +87,7 @@ persistent=yes
 
 # Minimum Python version to use for version dependent checks. Will default to
 # the version used to run pylint.
-py-version=3.10
+py-version=3.11
 
 # Discover python modules and packages in the file system subtree.
 recursive=no
diff --git a/src/hats/io/parquet_metadata.py b/src/hats/io/parquet_metadata.py
index d8d7cbce..37b12a5d 100644
--- a/src/hats/io/parquet_metadata.py
+++ b/src/hats/io/parquet_metadata.py
@@ -2,14 +2,17 @@
 
 from __future__ import annotations
 
+import io
 import random
 from pathlib import Path
 
+import nested_pandas as npd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pyarrow.dataset as pds
 import pyarrow.parquet as pq
+from astropy.io.votable.tree import FieldRef, Group, Param, VOTableFile
+from astropy.table import Table
 from upath import UPath
 
 from hats.io import file_io, paths
@@ -130,29 +133,6 @@
     return total_rows
 
 
-def read_row_group_fragments(metadata_file: str):
-    """Generator for metadata fragment row groups in a parquet metadata file.
-
-    Parameters
-    ----------
-    metadata_file : str
-        path to `_metadata` file.
-
-    Yields
-    ------
-    RowGroupFragment
-        metadata for individual row groups
-    """
-    metadata_file = get_upath(metadata_file)
-    if not file_io.is_regular_file(metadata_file):
-        metadata_file = paths.get_parquet_metadata_pointer(metadata_file)
-
-    dataset = pds.parquet_dataset(metadata_file, filesystem=metadata_file.fs)
-
-    for frag in dataset.get_fragments():
-        yield from frag.row_groups
-
-
 def _nonemin(value1, value2):
     """Similar to numpy's nanmin, but excludes `None` values.
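Note for downstream users: `read_row_group_fragments` is removed without a direct replacement. Where per-row-group metadata is still needed, pyarrow can be used on the `_metadata` file directly — a minimal sketch, assuming a locally readable path (`iter_row_group_metadata` is a hypothetical helper, not part of this change):

    import pyarrow.parquet as pq

    def iter_row_group_metadata(metadata_path):
        # Read the consolidated parquet footer and yield each row group's metadata.
        file_metadata = pq.read_metadata(metadata_path)
        for index in range(file_metadata.num_row_groups):
            yield file_metadata.row_group(index)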
@@ -489,3 +469,205 @@ def per_pixel_statistics(
         {stat_name: int for stat_name in int_col_names}
     )
     return frame
+
+
+def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | None:
+    """Determines the appropriate file to read for parquet metadata
+    stored in the _common_metadata or _metadata files.
+
+    Parameters
+    ----------
+    catalog_base_dir : str | Path | UPath
+        base path for the catalog
+
+    Returns
+    -------
+    UPath | None
+        path to a parquet file containing metadata schema.
+    """
+    common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
+    common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
+    metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
+    metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
+    if not (common_metadata_exists or metadata_exists):
+        return None
+    return common_metadata_file if common_metadata_exists else metadata_file
+
+
+# pylint: disable=protected-access
+def pa_schema_to_vo_schema(
+    catalog_base_dir: str | Path | UPath,
+    *,
+    verbose: bool = False,
+    field_units: dict | None = None,
+    field_ucds: dict | None = None,
+    field_descriptions: dict | None = None,
+    field_utypes: dict | None = None,
+):
+    """Create VOTableFile metadata, based on the names and types of fields in the parquet files.
+
+    Add ancillary attributes to fields where they are provided in the optional dictionaries.
+
+    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
+    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
+    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).
+
+    Parameters
+    ----------
+    catalog_base_dir : str | Path | UPath
+        base path for the catalog
+    verbose : bool
+        Should we print out additional debugging statements about the VO metadata?
+    field_units : dict | None
+        dictionary mapping column names to astropy units (or string representation of units)
+    field_ucds : dict | None
+        dictionary mapping column names to UCDs (Unified Content Descriptors)
+    field_descriptions : dict | None
+        dictionary mapping column names to free-text descriptions
+    field_utypes : dict | None
+        dictionary mapping column names to utypes
+
+    Returns
+    -------
+    VOTableFile
+        VO object containing all relevant metadata (but no data)
+    """
+    schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
+    if not schema_file:
+        return None
+
+    field_units = field_units or {}
+    field_ucds = field_ucds or {}
+    field_descriptions = field_descriptions or {}
+    field_utypes = field_utypes or {}
+
+    ## Read the full schema (including nested columns) from the parquet metadata.
+    nested_schema = npd.read_parquet(schema_file)
+
+    df_types = nested_schema.to_pandas().dtypes
+    names = []
+    data_types = []
+    for col in nested_schema.base_columns:
+        names.append(col)
+        type_str = str(df_types[col]).split("[", maxsplit=1)[0]
+        data_types.append(type_str)
+
+    for col in nested_schema.nested_columns:
+        for key, val in nested_schema[col].dtype.column_dtypes.items():
+            names.append(f"{col}.{key}")
+            data_types.append(str(val))
+    data_types = ["U" if t == "string" else t for t in data_types]
+
+    # Might have extra content for nested columns.
+    named_descriptions = {key: field_descriptions[key] for key in field_descriptions if key in names}
+    named_units = {key: field_units[key] for key in field_units if key in names}
+    if verbose:
+        dropped_keys_units = set(field_units.keys()) - set(named_units.keys())
+        dropped_keys_desc = set(field_descriptions.keys()) - set(named_descriptions.keys())
+        if dropped_keys_units or dropped_keys_desc:
+            print("================== Extra Fields ==================")
+            if dropped_keys_units:
+                print(f"warning - dropping some units ({len(dropped_keys_units)}):")
+                print(dropped_keys_units)
+            if dropped_keys_desc:
+                print(f"warning - dropping some descriptions ({len(dropped_keys_desc)}):")
+                print(dropped_keys_desc)
+
+    t = Table(names=names, dtype=data_types, units=named_units, descriptions=named_descriptions)
+
+    votablefile = VOTableFile()
+    votablefile = votablefile.from_table(t)
+
+    ## TODO - add info to root resource, e.g. obsregime.
+
+    ## Add groups for nested columns
+    vo_table = votablefile.get_first_table()
+    for col in nested_schema.nested_columns:
+        new_group = Group(vo_table, name=col, config=vo_table._config, pos=vo_table._pos)
+        if col in field_descriptions:
+            new_group.description = field_descriptions[col]
+        else:
+            new_group.description = "multi-column nested format"
+        vo_table.groups.append(new_group)
+
+        new_param = Param(vo_table, name="is_nested_column", datatype="boolean", value="t")
+        new_group.entries.append(new_param)
+
+        for key in nested_schema[col].columns:
+            new_field = FieldRef(vo_table, ref=f"{col}.{key}")
+            new_group.entries.append(new_field)
+
+    ## Go back and add UCD/utypes to fields
+    for field in vo_table.iter_fields_and_params():
+        field_name = field.name
+        if field_name in field_ucds:
+            field.ucd = field_ucds[field_name]
+        if field_name in field_utypes:
+            field.utype = field_utypes[field_name]
+    return votablefile
+
+
+def write_voparquet_in_common_metadata(
+    catalog_base_dir: str | Path | UPath,
+    *,
+    verbose: bool = False,
+    field_units: dict | None = None,
+    field_ucds: dict | None = None,
+    field_descriptions: dict | None = None,
+    field_utypes: dict | None = None,
+):
+    """Create VOTableFile metadata, based on the names and types of fields in the parquet files,
+    and write to a ``catalog_base_dir/dataset/_common_metadata`` parquet file.
+
+    Add ancillary attributes to fields where they are provided in the optional dictionaries.
+
+    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
+    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
+    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).
+
+    Parameters
+    ----------
+    catalog_base_dir : str | Path | UPath
+        base path for the catalog
+    verbose : bool
+        Should we print out additional debugging statements about the VO metadata?
+    field_units : dict | None
+        dictionary mapping column names to astropy units (or string representation of units)
+    field_ucds : dict | None
+        dictionary mapping column names to UCDs (Unified Content Descriptors)
+    field_descriptions : dict | None
+        dictionary mapping column names to free-text descriptions
+    field_utypes : dict | None
+        dictionary mapping column names to utypes
+    """
+    votablefile = pa_schema_to_vo_schema(
+        verbose=verbose,
+        catalog_base_dir=catalog_base_dir,
+        field_units=field_units,
+        field_ucds=field_ucds,
+        field_descriptions=field_descriptions,
+        field_utypes=field_utypes,
+    )
+    if votablefile is None:
+        return  # no _common_metadata or _metadata file found, nothing to annotate
+
+    xml_bstr = io.BytesIO()
+    votablefile.to_xml(xml_bstr)
+    xml_str = xml_bstr.getvalue().decode("utf-8")
+    if verbose:
+        print("================== Table XML ==================")
+        print(xml_str)
+
+    common_metadata_file_pointer = paths.get_common_metadata_pointer(catalog_base_dir)
+
+    pa_schema = file_io.read_parquet_metadata(common_metadata_file_pointer).schema.to_arrow_schema()
+
+    original_metadata = pa_schema.metadata or {}
+    updated_metadata = original_metadata | {
+        b"IVOA.VOTable-Parquet.version": b"1.0",
+        b"IVOA.VOTable-Parquet.content": xml_str,
+    }
+
+    pa_schema = pa_schema.with_metadata(updated_metadata)
+
+    file_io.write_parquet_metadata(pa_schema, common_metadata_file_pointer)
diff --git a/src/hats/loaders/read_hats.py b/src/hats/loaders/read_hats.py
index 51edc62f..0f3a812a 100644
--- a/src/hats/loaders/read_hats.py
+++ b/src/hats/loaders/read_hats.py
@@ -17,6 +17,7 @@ from hats.catalog.partition_info import PartitionInfo
 from hats.io import file_io, paths
 from hats.io.file_io import read_parquet_metadata
+from hats.io.parquet_metadata import pick_metadata_schema_file
 
 DATASET_TYPE_TO_CLASS = {
     CatalogType.OBJECT: Catalog,
@@ -106,16 +107,12 @@ def _read_moc_from_point_map(catalog_base_dir: str | Path | UPath) -> MOC | None
 
 def _read_schema_from_metadata(catalog_base_dir: str | Path | UPath) -> pa.Schema | None:
     """Reads the schema information stored in the _common_metadata or _metadata files."""
-    common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
-    common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
-    metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
-    metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
-    if not (common_metadata_exists or metadata_exists):
+    schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
+    if not schema_file:
         warnings.warn(
             "_common_metadata or _metadata files not found for this catalog."
             "The arrow schema will not be set."
         )
         return None
-    schema_file = common_metadata_file if common_metadata_exists else metadata_file
     metadata = read_parquet_metadata(schema_file)
     return metadata.schema.to_arrow_schema()
diff --git a/tests/.pylintrc b/tests/.pylintrc
index 35a1feb9..16368327 100644
--- a/tests/.pylintrc
+++ b/tests/.pylintrc
@@ -87,7 +87,7 @@ persistent=yes
 
 # Minimum Python version to use for version dependent checks. Will default to
 # the version used to run pylint.
-py-version=3.10
+py-version=3.11
 
 # Discover python modules and packages in the file system subtree.
 recursive=no
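For reference, a minimal usage sketch of the new writer (the catalog path is a placeholder; the units, UCDs, and descriptions mirror those exercised in the test diff that follows):

    import astropy.units as u
    from hats.io.parquet_metadata import write_voparquet_in_common_metadata

    # Embed VOTable metadata (per-column units, UCDs, and descriptions)
    # into the catalog's existing dataset/_common_metadata file.
    write_voparquet_in_common_metadata(
        "path/to/catalog",
        field_units={"ra": "deg", "dec": u.deg},
        field_ucds={"ra": "pos.eq.ra", "dec": "pos.eq.dec"},
        field_descriptions={"ra": "ICRS Right Ascension", "dec": "ICRS Declination"},
    )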
diff --git a/tests/hats/io/test_parquet_metadata.py b/tests/hats/io/test_parquet_metadata.py
index 4cfd2fd4..8c2e47c7 100644
--- a/tests/hats/io/test_parquet_metadata.py
+++ b/tests/hats/io/test_parquet_metadata.py
@@ -2,15 +2,23 @@
 
 import shutil
 
+import astropy.units as u
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
+from astropy.io.misc.parquet import read_parquet_votable
 from pandas.api.types import is_numeric_dtype
 from pyarrow.parquet import ParquetFile
 
 from hats.io import file_io, paths
-from hats.io.parquet_metadata import aggregate_column_statistics, per_pixel_statistics, write_parquet_metadata
+from hats.io.parquet_metadata import (
+    aggregate_column_statistics,
+    pa_schema_to_vo_schema,
+    per_pixel_statistics,
+    write_parquet_metadata,
+    write_voparquet_in_common_metadata,
+)
 from hats.pixel_math.healpix_pixel import HealpixPixel
 from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN
@@ -432,3 +440,135 @@ def test_per_pixel_statistics_with_rowgroups_empty_result(small_sky_source_dir):
         partition_info_file, include_pixels=[HealpixPixel(1, 4)], multi_index=True
     )
     assert len(result_frame) == 0
+
+
+def test_pa_schema_to_vo_schema_small(small_sky_nested_dir):
+    dec_utype = "stc:AstroCoords.Position2D.Value2.C2"
+    return_value = pa_schema_to_vo_schema(
+        small_sky_nested_dir,
+        field_units={
+            "ra": "deg",
+            "dec": u.deg,
+            "lc.source_ra": "deg",
+            "lc.source_dec": u.deg,
+            "lc.object_ra": "deg",
+            "lc.object_dec": u.deg,
+            "does_not_exist": "deg**2",
+        },
+        field_ucds={
+            "ra": "pos.eq.ra",
+            "dec": "pos.eq.dec",
+            "lc.source_ra": "pos.eq.ra",
+            "lc.source_dec": "pos.eq.dec",
+            "lc.object_ra": "pos.eq.ra",
+            "lc.object_dec": "pos.eq.dec",
+            "does_not_exist": "pos.eq.dec",
+        },
+        field_descriptions={
+            "ra": "Object ICRS Right Ascension",
+            "dec": "Object ICRS Declination",
+            "lc.source_ra": "Object ICRS Right Ascension",
+            "lc.source_dec": "Object ICRS Declination",
+            "lc.object_ra": "Object ICRS Right Ascension",
+            "lc.object_dec": "Object ICRS Declination",
+            "lc": "Properties of transient-object detections on the single-epoch difference images",
+            "lc.band": "Band used to take this observation",
+            "does_not_exist": "Band used to take this observation",
+        },
+        field_utypes={
+            "ra": "stc:AstroCoords.Position2D.Value2.C1",
+            "dec": dec_utype,
+            "does_not_exist": dec_utype,
+        },
+    )
+    assert return_value
+    dec_value = next(return_value.get_first_table().get_fields_by_utype(dec_utype))
+    assert dec_value.name == "dec"
+    assert dec_value.datatype == "double"
+    assert dec_value.unit == "deg"
+    assert dec_value.utype == dec_utype
+
+
+def test_write_voparquet_in_common_metadata_verbosity(small_sky_nested_dir, tmp_path, capsys):
+    catalog_base_dir = tmp_path / "catalog"
+    shutil.copytree(
+        small_sky_nested_dir,
+        catalog_base_dir,
+    )
+
+    dec_utype = "stc:AstroCoords.Position2D.Value2.C2"
+    field_kwargs = {
+        "field_units": {
+            "ra": "deg",
+            "dec": u.deg,
+            "does_not_exist": "deg**2",
+        },
+        "field_ucds": {
+            "ra": "pos.eq.ra",
+            "dec": "pos.eq.dec",
+            "does_not_exist": "pos.eq.dec",
+        },
+        "field_descriptions": {
+            "ra": "Object ICRS Right Ascension",
+            "dec": "Object ICRS Declination",
+            "does_not_exist": "Band used to take this observation",
+        },
+        "field_utypes": {
+            "ra": "stc:AstroCoords.Position2D.Value2.C1",
+            "dec": dec_utype,
+            "does_not_exist": dec_utype,
+        },
+    }
+
+    ## No verbosity - nothing printed
+    write_voparquet_in_common_metadata(catalog_base_dir, **field_kwargs)
+
+    captured = capsys.readouterr().out
+    assert captured == ""
+
+    ## Yes verbosity - print a few warnings and full XML
+    write_voparquet_in_common_metadata(catalog_base_dir, verbose=True, **field_kwargs)
+
+    captured = capsys.readouterr().out
+    assert "Extra Fields" in captured
+    assert "dropping some units" in captured
+    assert "dropping some descriptions" in captured
+    assert dec_utype in captured
+    # Default description for a nested field.
+    assert "multi-column nested format" in captured
+
+
+def test_write_voparquet_in_common_metadata_small(small_sky_order1_dir, tmp_path):
+    catalog_base_dir = tmp_path / "catalog"
+    shutil.copytree(
+        small_sky_order1_dir,
+        catalog_base_dir,
+    )
+
+    dec_utype = "stc:AstroCoords.Position2D.Value2.C2"
+    write_voparquet_in_common_metadata(
+        catalog_base_dir,
+        field_units={
+            "ra": "deg",
+            "dec": u.deg,
+            "does_not_exist": "deg**2",
+        },
+        field_ucds={
+            "ra": "pos.eq.ra",
+            "dec": "pos.eq.dec",
+            "does_not_exist": "pos.eq.dec",
+        },
+        field_descriptions={
+            "ra": "Object ICRS Right Ascension",
+            "dec": "Object ICRS Declination",
+            "does_not_exist": "Band used to take this observation",
+        },
+        field_utypes={
+            "ra": "stc:AstroCoords.Position2D.Value2.C1",
+            "dec": dec_utype,
+            "does_not_exist": dec_utype,
+        },
+    )
+
+    table = read_parquet_votable(catalog_base_dir / "dataset" / "_common_metadata")
+    assert table.colnames == ["_healpix_29", "id", "ra", "dec", "ra_error", "dec_error"]
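The embedded metadata can be read back with astropy, mirroring the round-trip in the last test (the catalog path is a placeholder):

    from astropy.io.misc.parquet import read_parquet_votable

    # Returns an (empty) astropy Table whose columns carry the VOTable
    # units/UCDs/descriptions written by write_voparquet_in_common_metadata.
    table = read_parquet_votable("path/to/catalog/dataset/_common_metadata")
    print(table.colnames)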