1 change: 0 additions & 1 deletion .copier-answers.yml
@@ -20,7 +20,6 @@ project_license: BSD
project_name: hats
project_organization: astronomy-commons
python_versions:
- '3.10'
- '3.11'
- '3.12'
- '3.13'
2 changes: 1 addition & 1 deletion .github/workflows/asv-main.yml
@@ -8,7 +8,7 @@ on:
    branches: [ main ]

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks

2 changes: 1 addition & 1 deletion .github/workflows/asv-nightly.yml
@@ -9,7 +9,7 @@ on:
  workflow_dispatch:

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks
  NIGHTLY_HASH_FILE: nightly-hash
2 changes: 1 addition & 1 deletion .github/workflows/asv-pr.yml
@@ -15,7 +15,7 @@ concurrency:
  cancel-in-progress: true

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks
  ARTIFACTS_DIR: ${{github.workspace}}/artifacts
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit-ci.yml
@@ -19,7 +19,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          python-version: '3.12'
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Install dependencies
2 changes: 1 addition & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -26,7 +26,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
2 changes: 1 addition & 1 deletion .github/workflows/smoke-test.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
        python-version: ['3.11', '3.12', '3.13']

    steps:
      - uses: actions/checkout@v5
6 changes: 3 additions & 3 deletions .github/workflows/testing-and-coverage.yml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
        python-version: ['3.11', '3.12', '3.13']

    steps:
      - uses: actions/checkout@v5
@@ -43,10 +43,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - name: Set up Python 3.10
      - name: Set up Python 3.11
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'
          python-version: '3.11'
      - name: Install dependencies
        run: |
          sudo apt-get update
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -94,7 +94,7 @@ repos:
        # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.11
        language_version: python3.12
  # Make sure Sphinx can build the documentation while explicitly omitting
  # notebooks from the docs, so users don't have to wait through the execution
  # of each notebook or each commit. By default, these will be checked in the
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -8,7 +8,7 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
    python: "3.12"

# Build documentation in the docs/ directory with Sphinx
sphinx:
2 changes: 1 addition & 1 deletion benchmarks/asv.conf.json
@@ -37,7 +37,7 @@
    // The Pythons you'd like to test against. If not provided, defaults
    // to the current version of Python used to run `asv`.
    "pythons": [
        "3.11"
        "3.12"
    ],
    // The matrix of dependencies to test. Each key is the name of a
    // package (in PyPI) and the values are version numbers. An empty
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -16,10 +16,10 @@ classifiers = [
    "Programming Language :: Python",
]
dynamic = ["version"]
requires-python = ">=3.10"
requires-python = ">=3.11"
dependencies = [
    "aiohttp>=3.8.0", # http filesystem support
    "astropy>=6.1.5",
    "astropy>=7.0.0",
    "cdshealpix>=0.7.0",
    "fsspec>=2023.10.0", # Used for abstract filesystems
    "jproperties>=2.0.0",
@@ -84,7 +84,7 @@ omit=["src/hats/_version.py"]

[tool.black]
line-length = 110
target-version = ["py310"]
target-version = ["py311"]
[tool.isort]
profile = "black"
line_length = 110
2 changes: 1 addition & 1 deletion src/.pylintrc
@@ -87,7 +87,7 @@ persistent=yes

# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.10
py-version=3.11

# Discover python modules and packages in the file system subtree.
recursive=no
229 changes: 205 additions & 24 deletions src/hats/io/parquet_metadata.py
@@ -2,14 +2,17 @@

from __future__ import annotations

import io
import random
from pathlib import Path

import nested_pandas as npd
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as pds
import pyarrow.parquet as pq
from astropy.io.votable.tree import FieldRef, Group, Param, VOTableFile
from astropy.table import Table
from upath import UPath

from hats.io import file_io, paths
@@ -130,29 +133,6 @@ def write_parquet_metadata(
    return total_rows


def read_row_group_fragments(metadata_file: str):
    """Generator for metadata fragment row groups in a parquet metadata file.

    Parameters
    ----------
    metadata_file : str
        path to `_metadata` file.

    Yields
    ------
    RowGroupFragment
        metadata for individual row groups
    """
    metadata_file = get_upath(metadata_file)
    if not file_io.is_regular_file(metadata_file):
        metadata_file = paths.get_parquet_metadata_pointer(metadata_file)

    dataset = pds.parquet_dataset(metadata_file, filesystem=metadata_file.fs)

    for frag in dataset.get_fragments():
        yield from frag.row_groups


def _nonemin(value1, value2):
    """Similar to numpy's nanmin, but excludes `None` values.

@@ -489,3 +469,204 @@ def per_pixel_statistics(
        {stat_name: int for stat_name in int_col_names}
    )
    return frame


def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | None:
    """Determines the appropriate file to read for parquet metadata
    stored in the _common_metadata or _metadata files.

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog

    Returns
    -------
    UPath | None
        path to a parquet file containing metadata schema.
    """
    common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
    common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
    metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
    metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
    if not (common_metadata_exists or metadata_exists):
        return None
    return common_metadata_file if common_metadata_exists else metadata_file


# pylint: disable=protected-access
def pa_schema_to_vo_schema(
    catalog_base_dir: str | Path | UPath,
    *,
    verbose: bool = False,
    field_units: dict | None = None,
    field_ucds: dict | None = None,
    field_descriptions: dict | None = None,
    field_utypes: dict | None = None,
):
    """Create VOTableFile metadata, based on the names and types of fields in the parquet files.

    Add ancillary attributes to fields where they are provided in the optional dictionaries.

    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog
    verbose : bool
        Should we print out additional debugging statements about the vo metadata?
    field_units : dict | None
        dictionary mapping column names to astropy units (or string representation of units)
    field_ucds : dict | None
        dictionary mapping column names to UCDs (Unified Content Descriptors)
    field_descriptions : dict | None
        dictionary mapping column names to free-text descriptions
    field_utypes : dict | None
        dictionary mapping column names to utypes

    Returns
    -------
    VOTableFile
        VO object containing all relevant metadata (but no data)
    """
    schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
    if not schema_file:
        return None

    field_units = field_units or {}
    field_ucds = field_ucds or {}
    field_descriptions = field_descriptions or {}
    field_utypes = field_utypes or {}

    ## Try to find VO metadata in the file:
    # metadata = file_io.read_parquet_metadata(schema_file)
    nested_schema = npd.read_parquet(schema_file)

    df_types = nested_schema.to_pandas().dtypes
    names = []
    data_types = []
    for col in nested_schema.base_columns:
        names.append(col)
        type_str = str(df_types[col]).split("[", maxsplit=1)[0]
        data_types.append(type_str)

    for col in nested_schema.nested_columns:
        for key, val in nested_schema[col].dtype.column_dtypes.items():
            names.append(f"{col}.{key}")
            data_types.append(str(val))
    data_types = ["U" if t == "string" else t for t in data_types]

    # Might have extra content for nested columns.
    named_descriptions = {key: field_descriptions[key] for key in field_descriptions if key in names}
    named_units = {key: field_units[key] for key in field_units if key in names}
    if verbose:
        dropped_keys_units = set(field_units.keys()) - set(named_units.keys())
        dropped_keys_desc = set(field_descriptions.keys()) - set(named_descriptions.keys())
        if dropped_keys_units or dropped_keys_desc:
            print("================== Extra Fields ==================")
        if dropped_keys_units:
            print(f"warning - dropping some units ({len(dropped_keys_units)}):")
            print(dropped_keys_units)
        if dropped_keys_desc:
            print(f"warning - dropping some descriptions ({len(dropped_keys_desc)}):")
            print(dropped_keys_desc)

    t = Table(names=names, dtype=data_types, units=named_units, descriptions=named_descriptions)

    votablefile = VOTableFile()
    votablefile = votablefile.from_table(t)

    ## TODO - add info to root resource, e.g. obsregime.

    ## Add groups for nested columns
    vo_table = votablefile.get_first_table()
    for col in nested_schema.nested_columns:
        new_group = Group(vo_table, name=col, config=vo_table._config, pos=vo_table._pos)
        if col in field_descriptions:
            new_group.description = field_descriptions[col]
        else:
            new_group.description = "multi-column nested format"
        vo_table.groups.append(new_group)

        new_param = Param(vo_table, name="is_nested_column", datatype="boolean", value="t")
        new_group.entries.append(new_param)

        for key in nested_schema[col].columns:
            new_field = FieldRef(vo_table, ref=f"{col}.{key}")
            new_group.entries.append(new_field)

    ## Go back and add UCD/utypes to fields
    for field in vo_table.iter_fields_and_params():
        field_name = field.name
        if field_name in field_ucds:
            field.ucd = field_ucds[field_name]
        if field_name in field_utypes:
            field.utype = field_utypes[field_name]
    return votablefile


def write_voparquet_in_common_metadata(
    catalog_base_dir: str | Path | UPath,
    *,
    verbose: bool = False,
    field_units: dict | None = None,
    field_ucds: dict | None = None,
    field_descriptions: dict | None = None,
    field_utypes: dict | None = None,
):
    """Create VOTableFile metadata, based on the names and types of fields in the parquet files,
    and write to a ``catalog_base_dir/dataset/_common_metadata`` parquet file.

    Add ancillary attributes to fields where they are provided in the optional dictionaries.

    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog
    verbose : bool
        Should we print out additional debugging statements about the vo metadata?
    field_units : dict | None
        dictionary mapping column names to astropy units (or string representation of units)
    field_ucds : dict | None
        dictionary mapping column names to UCDs (Unified Content Descriptors)
    field_descriptions : dict | None
        dictionary mapping column names to free-text descriptions
    field_utypes : dict | None
        dictionary mapping column names to utypes
    """
    votablefile = pa_schema_to_vo_schema(
        verbose=verbose,
        catalog_base_dir=catalog_base_dir,
        field_units=field_units,
        field_ucds=field_ucds,
        field_descriptions=field_descriptions,
        field_utypes=field_utypes,
    )

    xml_bstr = io.BytesIO()
    votablefile.to_xml(xml_bstr)
    xml_str = xml_bstr.getvalue().decode("utf-8")
    if verbose:
        print("================== Table XML ==================")
        print(xml_str)

    common_metadata_file_pointer = paths.get_common_metadata_pointer(catalog_base_dir)

    pa_schema = file_io.read_parquet_metadata(common_metadata_file_pointer).schema.to_arrow_schema()

    original_metadata = pa_schema.metadata or {}
    updated_metadata = original_metadata | {
        b"IVOA.VOTable-Parquet.version": b"1.0",
        b"IVOA.VOTable-Parquet.content": xml_str,
    }

    pa_schema = pa_schema.with_metadata(updated_metadata)

    file_io.write_parquet_metadata(pa_schema, common_metadata_file_pointer)
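
Reviewer note: a minimal usage sketch of the new writer. The catalog path and column names below are hypothetical, and it assumes the catalog's dataset/_common_metadata file already exists (the function reads it before appending the VOTable XML); dot notation addresses nested sub-columns as described in the docstrings above.

from hats.io.parquet_metadata import write_voparquet_in_common_metadata

# Hypothetical catalog path and column names, for illustration only.
write_voparquet_in_common_metadata(
    "/data/catalogs/my_catalog",
    field_units={"ra": "deg", "dec": "deg", "lightcurve.mjd": "d"},
    field_ucds={"ra": "pos.eq.ra;meta.main", "dec": "pos.eq.dec;meta.main"},
    field_descriptions={
        "lightcurve": "per-object light curve, stored as a nested column",
        "lightcurve.band": "photometric band of each observation",
    },
    verbose=True,  # print the generated VOTable XML for inspection
)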