1 change: 0 additions & 1 deletion .copier-answers.yml
@@ -20,7 +20,6 @@ project_license: BSD
project_name: hats
project_organization: astronomy-commons
python_versions:
- '3.10'
- '3.11'
- '3.12'
- '3.13'
2 changes: 1 addition & 1 deletion .github/workflows/asv-main.yml
@@ -8,7 +8,7 @@ on:
    branches: [ main ]

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks

2 changes: 1 addition & 1 deletion .github/workflows/asv-nightly.yml
@@ -9,7 +9,7 @@ on:
  workflow_dispatch:

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks
  NIGHTLY_HASH_FILE: nightly-hash
2 changes: 1 addition & 1 deletion .github/workflows/asv-pr.yml
@@ -15,7 +15,7 @@ concurrency:
  cancel-in-progress: true

env:
  PYTHON_VERSION: "3.11"
  PYTHON_VERSION: "3.12"
  ASV_VERSION: "0.6.5"
  WORKING_DIR: ${{github.workspace}}/benchmarks
  ARTIFACTS_DIR: ${{github.workspace}}/artifacts
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit-ci.yml
@@ -19,7 +19,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          python-version: '3.12'
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Install dependencies
2 changes: 1 addition & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -26,7 +26,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
2 changes: 1 addition & 1 deletion .github/workflows/smoke-test.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
        python-version: ['3.11', '3.12', '3.13']

    steps:
      - uses: actions/checkout@v5
6 changes: 3 additions & 3 deletions .github/workflows/testing-and-coverage.yml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
        python-version: ['3.11', '3.12', '3.13']

    steps:
      - uses: actions/checkout@v5
@@ -43,10 +43,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - name: Set up Python 3.10
      - name: Set up Python 3.11
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'
          python-version: '3.11'
      - name: Install dependencies
        run: |
          sudo apt-get update
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -94,7 +94,7 @@ repos:
        # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.11
        language_version: python3.12
  # Make sure Sphinx can build the documentation while explicitly omitting
  # notebooks from the docs, so users don't have to wait through the execution
  # of each notebook or each commit. By default, these will be checked in the
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -8,7 +8,7 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
    python: "3.12"

# Build documentation in the docs/ directory with Sphinx
sphinx:
2 changes: 1 addition & 1 deletion benchmarks/asv.conf.json
@@ -37,7 +37,7 @@
    // The Pythons you'd like to test against. If not provided, defaults
    // to the current version of Python used to run `asv`.
    "pythons": [
        "3.11"
        "3.12"
    ],
    // The matrix of dependencies to test. Each key is the name of a
    // package (in PyPI) and the values are version numbers. An empty
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -16,10 +16,10 @@ classifiers = [
    "Programming Language :: Python",
]
dynamic = ["version"]
requires-python = ">=3.10"
requires-python = ">=3.11"
dependencies = [
    "aiohttp>=3.8.0", # http filesystem support
    "astropy>=6.1.5",
    "astropy>=7.0.0",
    "cdshealpix>=0.7.0",
    "fsspec>=2023.10.0", # Used for abstract filesystems
    "jproperties>=2.0.0",
@@ -84,7 +84,7 @@ omit=["src/hats/_version.py"]

[tool.black]
line-length = 110
target-version = ["py310"]
target-version = ["py311"]
[tool.isort]
profile = "black"
line_length = 110
2 changes: 1 addition & 1 deletion src/.pylintrc
@@ -87,7 +87,7 @@ persistent=yes

# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.10
py-version=3.11

# Discover python modules and packages in the file system subtree.
recursive=no
229 changes: 205 additions & 24 deletions src/hats/io/parquet_metadata.py
@@ -2,14 +2,17 @@

from __future__ import annotations

import io
import random
from pathlib import Path

import nested_pandas as npd
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as pds
import pyarrow.parquet as pq
from astropy.io.votable.tree import FieldRef, Group, Param, VOTableFile
from astropy.table import Table
from upath import UPath

from hats.io import file_io, paths
@@ -130,29 +133,6 @@ def write_parquet_metadata(
    return total_rows


def read_row_group_fragments(metadata_file: str):
    """Generator for metadata fragment row groups in a parquet metadata file.

    Parameters
    ----------
    metadata_file : str
        path to `_metadata` file.

    Yields
    ------
    RowGroupFragment
        metadata for individual row groups
    """
    metadata_file = get_upath(metadata_file)
    if not file_io.is_regular_file(metadata_file):
        metadata_file = paths.get_parquet_metadata_pointer(metadata_file)

    dataset = pds.parquet_dataset(metadata_file, filesystem=metadata_file.fs)

    for frag in dataset.get_fragments():
        yield from frag.row_groups


def _nonemin(value1, value2):
    """Similar to numpy's nanmin, but excludes `None` values.

@@ -489,3 +469,204 @@ def per_pixel_statistics(
        {stat_name: int for stat_name in int_col_names}
    )
    return frame


def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | None:
    """Determines the appropriate file to read for parquet metadata
    stored in the _common_metadata or _metadata files.

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog

    Returns
    -------
    UPath | None
        path to a parquet file containing metadata schema.
    """
    common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
    common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
    metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
    metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
    if not (common_metadata_exists or metadata_exists):
        return None
    return common_metadata_file if common_metadata_exists else metadata_file


# pylint: disable=protected-access
def pa_schema_to_vo_schema(
    catalog_base_dir: str | Path | UPath,
    *,
    verbose: bool = False,
    field_units: dict | None = None,
    field_ucds: dict | None = None,
    field_descriptions: dict | None = None,
    field_utypes: dict | None = None,
):
    """Create VOTableFile metadata, based on the names and types of fields in the parquet files.

    Add ancillary attributes to fields where they are provided in the optional dictionaries.

    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog
    verbose : bool
        Should we print out additional debugging statements about the vo metadata?
    field_units : dict | None
        dictionary mapping column names to astropy units (or string representation of units)
    field_ucds : dict | None
        dictionary mapping column names to UCDs (Unified Content Descriptors)
    field_descriptions : dict | None
        dictionary mapping column names to free-text descriptions
    field_utypes : dict | None
        dictionary mapping column names to utypes

    Returns
    -------
    VOTableFile
        VO object containing all relevant metadata (but no data)
    """
    schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
    if not schema_file:
        return None

    field_units = field_units or {}
    field_ucds = field_ucds or {}
    field_descriptions = field_descriptions or {}
    field_utypes = field_utypes or {}

    ## Try to find VO metadata in the file:
    # metadata = file_io.read_parquet_metadata(schema_file)
    nested_schema = npd.read_parquet(schema_file)

    df_types = nested_schema.to_pandas().dtypes
    names = []
    data_types = []
    for col in nested_schema.base_columns:
        names.append(col)
        type_str = str(df_types[col]).split("[", maxsplit=1)[0]
        data_types.append(type_str)

    for col in nested_schema.nested_columns:
        for key, val in nested_schema[col].dtype.column_dtypes.items():
            names.append(f"{col}.{key}")
            data_types.append(str(val))
    data_types = ["U" if t == "string" else t for t in data_types]

    # Might have extra content for nested columns.
    named_descriptions = {key: field_descriptions[key] for key in field_descriptions if key in names}
    named_units = {key: field_units[key] for key in field_units if key in names}
    if verbose:
        dropped_keys_units = set(field_units.keys()) - set(named_units.keys())
        dropped_keys_desc = set(field_descriptions.keys()) - set(named_descriptions.keys())
        if dropped_keys_units or dropped_keys_desc:
            print("================== Extra Fields ==================")
        if dropped_keys_units:
            print(f"warning - dropping some units ({len(dropped_keys_units)}):")
            print(dropped_keys_units)
        if dropped_keys_desc:
            print(f"warning - dropping some descriptions ({len(dropped_keys_desc)}):")
            print(dropped_keys_desc)

    t = Table(names=names, dtype=data_types, units=named_units, descriptions=named_descriptions)

    votablefile = VOTableFile()
    votablefile = votablefile.from_table(t)

    ## TODO - add info to root resource, e.g. obsregime.

    ## Add groups for nested columns
    vo_table = votablefile.get_first_table()
    for col in nested_schema.nested_columns:
        new_group = Group(vo_table, name=col, config=vo_table._config, pos=vo_table._pos)
        if col in field_descriptions:
            new_group.description = field_descriptions[col]
        else:
            new_group.description = "multi-column nested format"
        vo_table.groups.append(new_group)

        new_param = Param(vo_table, name="is_nested_column", datatype="boolean", value="t")
        new_group.entries.append(new_param)

        for key in nested_schema[col].columns:
            new_field = FieldRef(vo_table, ref=f"{col}.{key}")
            new_group.entries.append(new_field)

    ## Go back and add UCD/utypes to fields
    for field in vo_table.iter_fields_and_params():
        field_name = field.name
        if field_name in field_ucds:
            field.ucd = field_ucds[field_name]
        if field_name in field_utypes:
            field.utype = field_utypes[field_name]
    return votablefile


def write_voparquet_in_common_metadata(
    catalog_base_dir: str | Path | UPath,
    *,
    verbose: bool = False,
    field_units: dict | None = None,
    field_ucds: dict | None = None,
    field_descriptions: dict | None = None,
    field_utypes: dict | None = None,
):
    """Create VOTableFile metadata, based on the names and types of fields in the parquet files,
    and write to a ``catalog_base_dir/dataset/_common_metadata`` parquet file.

    Add ancillary attributes to fields where they are provided in the optional dictionaries.

    Note on field names with nested columns: to include ancillary attributes (units, UCDs, etc.)
    for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
    attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).

    Parameters
    ----------
    catalog_base_dir : str | Path | UPath
        base path for the catalog
    verbose : bool
        Should we print out additional debugging statements about the vo metadata?
    field_units : dict | None
        dictionary mapping column names to astropy units (or string representation of units)
    field_ucds : dict | None
        dictionary mapping column names to UCDs (Unified Content Descriptors)
    field_descriptions : dict | None
        dictionary mapping column names to free-text descriptions
    field_utypes : dict | None
        dictionary mapping column names to utypes
    """
    votablefile = pa_schema_to_vo_schema(
        verbose=verbose,
        catalog_base_dir=catalog_base_dir,
        field_units=field_units,
        field_ucds=field_ucds,
        field_descriptions=field_descriptions,
        field_utypes=field_utypes,
    )

    xml_bstr = io.BytesIO()
    votablefile.to_xml(xml_bstr)
    xml_str = xml_bstr.getvalue().decode("utf-8")
    if verbose:
        print("================== Table XML ==================")
        print(xml_str)

    common_metadata_file_pointer = paths.get_common_metadata_pointer(catalog_base_dir)

    pa_schema = file_io.read_parquet_metadata(common_metadata_file_pointer).schema.to_arrow_schema()

    original_metadata = pa_schema.metadata or {}
    updated_metadata = original_metadata | {
        b"IVOA.VOTable-Parquet.version": b"1.0",
        b"IVOA.VOTable-Parquet.content": xml_str,
    }

    pa_schema = pa_schema.with_metadata(updated_metadata)

    file_io.write_parquet_metadata(pa_schema, common_metadata_file_pointer)
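
Reviewer note: a minimal usage sketch of the new writer. The catalog path and column names below are hypothetical, and it assumes the catalog's dataset/_common_metadata file already exists (the function reads it before appending the VOTable XML); dot notation addresses nested sub-columns as described in the docstrings above.

from hats.io.parquet_metadata import write_voparquet_in_common_metadata

# Hypothetical catalog path and column names, for illustration only.
write_voparquet_in_common_metadata(
    "/data/catalogs/my_catalog",
    field_units={"ra": "deg", "dec": "deg", "lightcurve.mjd": "d"},
    field_ucds={"ra": "pos.eq.ra;meta.main", "dec": "pos.eq.dec;meta.main"},
    field_descriptions={
        "lightcurve": "per-object light curve, stored as a nested column",
        "lightcurve.band": "photometric band of each observation",
    },
    verbose=True,  # print the generated VOTable XML for inspection
)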