Skip to content

Commit cce3f3a

Browse files
authored
VOParquet metadata generation (#612)
* VOParquet metadata generation
* Responses to code review comments.
1 parent 5d0f2bd commit cce3f3a

File tree

4 files changed

+343
-32
lines changed

4 files changed

+343
-32
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ dynamic = ["version"]
1919
requires-python = ">=3.11"
2020
dependencies = [
2121
"aiohttp>=3.10.0", # http filesystem support
22-
"astropy>=6.1.5",
22+
"astropy>=7.0.0",
2323
"cdshealpix>=0.7.0",
2424
"fsspec>=2023.10.0", # Used for abstract filesystems
2525
"jproperties>=2.0.0",

src/hats/io/parquet_metadata.py

Lines changed: 197 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@
22

33
from __future__ import annotations
44

5+
import io
56
import random
67
from pathlib import Path
78

9+
import nested_pandas as npd
810
import numpy as np
911
import pandas as pd
1012
import pyarrow as pa
11-
import pyarrow.dataset as pds
1213
import pyarrow.parquet as pq
14+
from astropy.io.votable.tree import FieldRef, Group, Param, VOTableFile
15+
from astropy.table import Table
1316
from upath import UPath
1417

1518
from hats.io import file_io, paths
@@ -165,29 +168,6 @@ def write_parquet_metadata(
165168
return total_rows
166169

167170

168-
def read_row_group_fragments(metadata_file: str):
169-
"""Generator for metadata fragment row groups in a parquet metadata file.
170-
171-
Parameters
172-
----------
173-
metadata_file : str
174-
path to `_metadata` file.
175-
176-
Yields
177-
------
178-
RowGroupFragment
179-
metadata for individual row groups
180-
"""
181-
metadata_file = get_upath(metadata_file)
182-
if not file_io.is_regular_file(metadata_file):
183-
metadata_file = paths.get_parquet_metadata_pointer(metadata_file)
184-
185-
dataset = pds.parquet_dataset(metadata_file.path, filesystem=metadata_file.fs)
186-
187-
for frag in dataset.get_fragments():
188-
yield from frag.row_groups
189-
190-
191171
def _nonemin(value1, value2):
192172
"""Similar to numpy's nanmin, but excludes `None` values.
193173
@@ -529,3 +509,196 @@ def per_pixel_statistics(
529509
{stat_name: int for stat_name in int_col_names}
530510
)
531511
return frame
512+
513+
514+
def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | None:
515+
"""Determines the appropriate file to read for parquet metadata
516+
stored in the _common_metadata or _metadata files.
517+
518+
Parameters
519+
----------
520+
catalog_base_dir : str | Path | UPath
521+
base path for the catalog
522+
523+
Returns
524+
-------
525+
UPath | None
526+
path to a parquet file containing metadata schema.
527+
"""
528+
common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
529+
common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
530+
metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
531+
metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
532+
if not (common_metadata_exists or metadata_exists):
533+
return None
534+
return common_metadata_file if common_metadata_exists else metadata_file
535+
536+
537+
# pylint: disable=protected-access
538+
def nested_frame_to_vo_schema(
539+
nested_frame: npd.NestedFrame,
540+
*,
541+
verbose: bool = False,
542+
field_units: dict | None = None,
543+
field_ucds: dict | None = None,
544+
field_descriptions: dict | None = None,
545+
field_utypes: dict | None = None,
546+
):
547+
"""Create VOTableFile metadata, based on the names and types of fields in the NestedFrame.
548+
Add ancillary attributes to fields where they are provided in the optional dictionaries.
549+
Note on field names with nested columns: to include ancillary attributes (units, ucds, etc)
550+
for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
551+
attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).
552+
553+
Parameters
554+
----------
555+
nested_frame : npd.NestedFrame
556+
nested frame representing catalog data. this can be empty, as we only need to
557+
know about the column names and types.
558+
verbose: bool
559+
Should we print out additional debugging statements about the vo metadata?
560+
field_units: dict | None
561+
dictionary mapping column names to astropy units (or string representation of units)
562+
field_ucds: dict | None
563+
dictionary mapping column names to UCDs (Uniform Content Descriptors)
564+
field_descriptions: dict | None
565+
dictionary mapping column names to free-text descriptions
566+
field_utypes: dict | None
567+
dictionary mapping column names to utypes
568+
569+
Returns
570+
-------
571+
VOTableFile
572+
VO object containing all relevant metadata (but no data)
573+
"""
574+
field_units = field_units or {}
575+
field_ucds = field_ucds or {}
576+
field_descriptions = field_descriptions or {}
577+
field_utypes = field_utypes or {}
578+
579+
# Collate and tidy up the column names and data types.
580+
df_types = nested_frame.to_pandas().dtypes
581+
names = []
582+
data_types = []
583+
for col in nested_frame.base_columns:
584+
names.append(col)
585+
data_types.append(str(df_types[col]))
586+
587+
for col in nested_frame.nested_columns:
588+
for key, val in nested_frame[col].dtype.column_dtypes.items():
589+
names.append(f"{col}.{key}")
590+
data_types.append(str(val))
591+
# astropy.Table uses numpy-style dtypes, and this cleans up type strings.
592+
data_types = ["U" if "string" in t else t.removesuffix("[pyarrow]") for t in data_types]
593+
594+
# Might have extra content for nested columns.
595+
named_descriptions = {key: field_descriptions[key] for key in field_descriptions if key in names}
596+
named_units = {key: field_units[key] for key in field_units if key in names}
597+
if verbose:
598+
dropped_keys_units = set(field_units.keys()) - set(named_units.keys())
599+
dropped_keys_desc = set(field_descriptions.keys()) - set(named_descriptions.keys())
600+
if dropped_keys_units or dropped_keys_desc:
601+
print("================== Extra Fields ==================")
602+
if dropped_keys_units:
603+
print(f"warning - dropping some units ({len(dropped_keys_units)}):")
604+
print(dropped_keys_units)
605+
if dropped_keys_desc:
606+
print(f"warning - dropping some descriptions ({len(dropped_keys_desc)}):")
607+
print(dropped_keys_desc)
608+
609+
t = Table(names=names, dtype=data_types, units=named_units, descriptions=named_descriptions)
610+
611+
votablefile = VOTableFile()
612+
votablefile = votablefile.from_table(t)
613+
614+
## TODO - add info to root resource, e.g. obsregime.
615+
616+
## Add groups for nested columns
617+
vo_table = votablefile.get_first_table()
618+
for col in nested_frame.nested_columns:
619+
new_group = Group(vo_table, name=col, config=vo_table._config, pos=vo_table._pos)
620+
if col in field_descriptions:
621+
new_group.description = field_descriptions[col]
622+
else:
623+
new_group.description = "multi-column nested format"
624+
vo_table.groups.append(new_group)
625+
626+
new_param = Param(vo_table, name="is_nested_column", datatype="boolean", value="t")
627+
new_group.entries.append(new_param)
628+
629+
for key in nested_frame[col].columns:
630+
new_field = FieldRef(vo_table, ref=f"{col}.{key}")
631+
new_group.entries.append(new_field)
632+
633+
## Go back and add UCD/utypes to fields
634+
for field in vo_table.iter_fields_and_params():
635+
field_name = field.name
636+
if field_name in field_ucds:
637+
field.ucd = field_ucds[field_name]
638+
if field_name in field_utypes:
639+
field.utype = field_utypes[field_name]
640+
return votablefile
641+
642+
643+
def write_voparquet_in_common_metadata(
644+
catalog_base_dir: str | Path | UPath,
645+
*,
646+
verbose: bool = False,
647+
field_units: dict | None = None,
648+
field_ucds: dict | None = None,
649+
field_descriptions: dict | None = None,
650+
field_utypes: dict | None = None,
651+
):
652+
"""Create VOTableFile metadata, based on the names and types of fields in the parquet files,
653+
and write to a ``catalog_base_dir/dataset/_common_metadata`` parquet file.
654+
Add ancillary attributes to fields where they are provided in the optional dictionaries.
655+
Note on field names with nested columns: to include ancillary attributes (units, ucds, etc)
656+
for a nested sub-column, use dot notation (e.g. ``"lightcurve.band"``). You can add ancillary
657+
attributes for the entire nested column group using the nested column name (e.g. ``"lightcurve"``).
658+
659+
Parameters
660+
----------
661+
catalog_base_dir : str | Path | UPath
662+
base path for the catalog
663+
verbose: bool
664+
Should we print out additional debugging statements about the vo metadata?
665+
field_units: dict | None
666+
dictionary mapping column names to astropy units (or string representation of units)
667+
field_ucds: dict | None
668+
dictionary mapping column names to UCDs (Uniform Content Descriptors)
669+
field_descriptions: dict | None
670+
dictionary mapping column names to free-text descriptions
671+
field_utypes: dict | None
672+
dictionary mapping column names to utypes
673+
"""
674+
schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
675+
if not schema_file:
676+
return
677+
nested_frame = npd.read_parquet(schema_file)
678+
votablefile = nested_frame_to_vo_schema(
679+
nested_frame=nested_frame,
680+
verbose=verbose,
681+
field_units=field_units,
682+
field_ucds=field_ucds,
683+
field_descriptions=field_descriptions,
684+
field_utypes=field_utypes,
685+
)
686+
687+
xml_bstr = io.BytesIO()
688+
votablefile.to_xml(xml_bstr)
689+
xml_str = xml_bstr.getvalue().decode("utf-8")
690+
if verbose:
691+
print("================== Table XML ==================")
692+
print(xml_str)
693+
694+
pa_schema = file_io.read_parquet_metadata(schema_file).schema.to_arrow_schema()
695+
696+
original_metadata = pa_schema.metadata or {}
697+
updated_metadata = original_metadata | {
698+
b"IVOA.VOTable-Parquet.version": b"1.0",
699+
b"IVOA.VOTable-Parquet.content": xml_str,
700+
}
701+
702+
pa_schema = pa_schema.with_metadata(updated_metadata)
703+
704+
file_io.write_parquet_metadata(pa_schema, schema_file)

src/hats/loaders/read_hats.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from hats.catalog.partition_info import PartitionInfo
1818
from hats.io import file_io, paths
1919
from hats.io.file_io import read_parquet_metadata
20+
from hats.io.parquet_metadata import pick_metadata_schema_file
2021

2122
DATASET_TYPE_TO_CLASS = {
2223
CatalogType.OBJECT: Catalog,
@@ -106,16 +107,12 @@ def _read_moc_from_point_map(catalog_base_dir: str | Path | UPath) -> MOC | None
106107

107108
def _read_schema_from_metadata(catalog_base_dir: str | Path | UPath) -> pa.Schema | None:
108109
"""Reads the schema information stored in the _common_metadata or _metadata files."""
109-
common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
110-
common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
111-
metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
112-
metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
113-
if not (common_metadata_exists or metadata_exists):
110+
schema_file = pick_metadata_schema_file(catalog_base_dir=catalog_base_dir)
111+
if not schema_file:
114112
warnings.warn(
115113
"_common_metadata or _metadata files not found for this catalog."
116114
"The arrow schema will not be set."
117115
)
118116
return None
119-
schema_file = common_metadata_file if common_metadata_exists else metadata_file
120117
metadata = read_parquet_metadata(schema_file)
121118
return metadata.schema.to_arrow_schema()

0 commit comments

Comments (0)