Skip to content

Commit b6cebe1

Browse files
authored
Merge pull request #637 from astronomy-commons/gen-readme-columns
Update catalog README.md generation: row example and cone search code example
2 parents 9997828 + 9733abc commit b6cebe1

File tree

3 files changed

+279
-33
lines changed

3 files changed

+279
-33
lines changed

src/hats/io/summary_file.py

Lines changed: 184 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import importlib.resources
2+
from itertools import starmap
23
from pathlib import Path
34
from typing import Literal
45

56
import human_readable
67
import jinja2
78
import nested_pandas as npd
9+
import numpy as np
810
import pandas as pd
911
from upath import UPath
1012

@@ -13,6 +15,7 @@
1315
from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset
1416
from hats.io import get_common_metadata_pointer, get_partition_info_pointer, templates
1517
from hats.io.file_io import get_upath, read_parquet_file_to_pandas
18+
from hats.io.paths import get_data_thumbnail_pointer
1619
from hats.loaders.read_hats import read_hats
1720

1821

@@ -178,17 +181,24 @@ def generate_markdown_collection_summary(
178181
else:
179182
empty_nf = None
180183

181-
has_nested_columns = False if empty_nf is None else len(empty_nf.nested_columns) > 0
182-
183184
metadata_table = _gen_md_metadata_table(
184185
catalog, total_columns=None if empty_nf is None else empty_nf.shape[1]
185186
)
186187

187-
column_table = (
188-
pd.DataFrame()
189-
if empty_nf is None
190-
else _gen_md_column_table(empty_nf, cat_props.default_columns or [])
191-
)
188+
column_table = _gen_md_column_table(catalog, empty_nf)
189+
190+
if "example" in column_table:
191+
ra = np.round(float(column_table.loc[cat_props.ra_column]["example"]))
192+
if ra >= 360.0:
193+
ra -= 360.0
194+
dec = np.round(float(column_table.loc[cat_props.dec_column]["example"]))
195+
if dec >= 90.0:
196+
dec = 89.9
197+
if dec <= -90.0:
198+
dec = -89.9
199+
cone_code_example = {"ra": ra, "dec": dec}
200+
else:
201+
cone_code_example = None
192202

193203
return template.render(
194204
name=name,
@@ -197,11 +207,11 @@ def generate_markdown_collection_summary(
197207
cat_props=cat_props,
198208
uris=uris,
199209
has_partition_info=has_partition_info,
210+
has_default_columns=bool(cat_props.default_columns),
211+
cone_code_example=cone_code_example,
200212
margin_thresholds=margin_thresholds,
201213
uri=uri,
202214
huggingface_metadata=huggingface_metadata,
203-
has_default_columns=cat_props.default_columns is not None,
204-
has_nested_columns=has_nested_columns,
205215
metadata_table=metadata_table,
206216
column_table=column_table,
207217
)
@@ -230,28 +240,158 @@ def _gen_md_metadata_table(catalog: HealpixDataset, total_columns: int | None) -
230240
return metadata_table
231241

232242

233-
def _gen_md_column_table(nf: npd.NestedFrame, default_columns: list[str]) -> pd.DataFrame:
234-
default_columns = frozenset(default_columns)
243+
def _fmt_count_percent(n: int, total: int) -> str:
244+
if n == 0:
245+
return "0"
246+
percent = round(n / total * 100, 2)
247+
if percent < 0.01:
248+
return f"{n:,} (<0.01%)"
249+
return f"{n:,} ({percent}%)"
250+
251+
252+
def _hard_truncate(s: str, limit: int) -> str:
253+
if len(s) <= limit:
254+
return s
255+
return s[: limit - 1] + "…"
256+
257+
258+
def _format_example_value(
259+
value, *, float_precision: int = 4, soft_limit: int = 50, hard_limit: int = 70
260+
) -> str:
261+
"""Format an example value for display in a summary table.
262+
263+
Floats are rounded to a limited number of significant figures.
264+
Lists are shown with as many items as fit within ``soft_limit``
265+
characters (always at least one), with a ``(N total)`` suffix when
266+
truncated. Any resulting string longer than ``hard_limit`` is
267+
truncated with ``…``.
268+
"""
269+
if value is None:
270+
return "*NULL*"
271+
272+
if isinstance(value, (float, np.floating)):
273+
if np.isnan(value):
274+
return "*NaN*"
275+
if np.isinf(value):
276+
return "-∞" if value < 0 else "∞"
277+
return f"{value:.{float_precision}g}"
278+
279+
if isinstance(value, (list, tuple, np.ndarray)):
280+
items = list(value)
281+
if len(items) == 0:
282+
return "[]"
283+
fmt_kwargs = {"float_precision": float_precision, "soft_limit": soft_limit, "hard_limit": hard_limit}
284+
suffix = f", … ({len(items)} total)]"
285+
# Always include at least one item
286+
parts = [_format_example_value(items[0], **fmt_kwargs)]
287+
for item in items[1:]:
288+
candidate = _format_example_value(item, **fmt_kwargs)
289+
# Check if adding this item would exceed the soft limit,
290+
# accounting for the truncation suffix
291+
preview = "[" + ", ".join(parts + [candidate]) + suffix
292+
if len(preview) > soft_limit:
293+
break
294+
parts.append(candidate)
295+
if len(parts) < len(items):
296+
result = "[" + ", ".join(parts) + suffix
297+
else:
298+
result = "[" + ", ".join(parts) + "]"
299+
else:
300+
result = str(value)
301+
302+
return _hard_truncate(result, hard_limit)
303+
304+
305+
def _build_column_table(
    nf: npd.NestedFrame, default_columns, fmt_value=_format_example_value
) -> pd.DataFrame:
    """Build column info table from a NestedFrame and default column names.

    The result is indexed by column name (index name ``"column"``), with one
    entry per regular column and one per sub-column of every nested column.
    It always has a ``dtype`` column, and additionally:

    - ``default`` — only when ``default_columns`` is non-empty;
    - ``nested_into`` — only when ``nf`` has nested columns;
    - ``example`` — only when ``nf`` is non-empty; values are taken from the
      first row and formatted with ``fmt_value``.

    Parameters
    ----------
    nf : npd.NestedFrame
        Frame to describe; may have zero rows.
    default_columns
        Iterable of default column names, or ``None``.
    fmt_value
        Callable turning an example value into a display string.
    """
    default_columns = frozenset(default_columns or [])
    has_nested_columns = len(nf.nested_columns) > 0
    has_example_row = not nf.empty

    column = []
    dtype = []
    # Optional columns stay ``None`` (instead of an empty list) when they
    # must not appear in the output at all.
    default = [] if len(default_columns) > 0 else None
    nested_into = [] if has_nested_columns else None
    example = [] if has_example_row else None

    for name, dt in nf.dtypes.items():
        if isinstance(dt, npd.NestedDtype):
            subcolumns = nf.get_subcolumns(name)
            column.extend(subcolumns)
            dtype.extend(f"list[{nf[sc].dtype.pyarrow_dtype}]" for sc in subcolumns)
            if default is not None:
                # A sub-column is default if it, or its whole parent, is listed.
                default.extend(name in default_columns or sc in default_columns for sc in subcolumns)
            # NOTE(review): assumes a NestedDtype column implies that
            # ``nf.nested_columns`` is non-empty, so ``nested_into`` is a list.
            nested_into.extend([name] * len(subcolumns))
            if example is not None:
                # Only touch the first row when there is one: for an empty
                # frame there is no cell to read and no example list to fill.
                cell = nf[name].iloc[0]
                # NOTE(review): assumes ``cell.items()`` yields the sub-columns
                # in the same order as ``nf.get_subcolumns(name)`` — confirm.
                example.extend(fmt_value(series.to_list()) for _, series in cell.items())
        else:
            column.append(name)
            dtype.append(str(dt.pyarrow_dtype))
            if default is not None:
                default.append(name in default_columns)
            if nested_into is not None:
                nested_into.append(None)
            if example is not None:
                example.append(fmt_value(nf[name].iloc[0]))

    index = pd.Index(column, name="column")
    result = pd.DataFrame(
        {
            "dtype": pd.Series(dtype, dtype=str, index=index),
        },
        index=index,
    )
    if default is not None:
        result["default"] = pd.Series(default, dtype=bool, index=index)
    if nested_into is not None:
        result["nested_into"] = pd.Series(nested_into, dtype=str, index=index)
    if example is not None:
        result["example"] = pd.Series(example, dtype=object, index=index)

    return result
354+
253355

254-
return pd.DataFrame({"column": column, "dtype": dtype, "default": default, "nested_into": nested_into})
356+
def _gen_md_column_table(
    catalog: HealpixDataset, empty_nf: npd.NestedFrame | None, fmt_value=_format_example_value
) -> pd.DataFrame:
    """Assemble the per-column table rendered in the catalog summary.

    The base table is built from an example row of the catalog when one is
    available, otherwise from ``empty_nf``; when neither exists an empty
    ``DataFrame`` is returned. If the catalog provides aggregated column
    statistics, ``min_value`` and ``max_value`` columns are added, ``rows``
    is added when any per-column row count differs from the catalog's total
    row count, and ``nulls`` is added when any nulls are present. Columns
    without statistics get ``*N/A*`` in those fields.
    """
    props = catalog.catalog_info

    example_nf = _get_example_row(catalog)
    if example_nf is None and empty_nf is None:
        return pd.DataFrame()
    source_nf = empty_nf if example_nf is None else example_nf

    table = _build_column_table(source_nf, props.default_columns, fmt_value)

    stats = catalog.aggregate_column_statistics(exclude_hats_columns=False)
    if stats.empty:
        return table

    # Table columns for which no statistics were reported.
    absent = list(set(table.index) - set(stats.index))

    def _with_placeholders(values):
        # Extend the series in place so every table column has an entry.
        for label in absent:
            values.loc[label] = "*N/A*"
        return values

    for stat_name in ("min_value", "max_value"):
        table[stat_name] = _with_placeholders(stats[stat_name].map(fmt_value))

    row_count = stats["row_count"]
    if np.any(row_count != props.total_rows):
        table["rows"] = _with_placeholders(row_count.map(lambda n: f"{n:,}"))

    if stats["null_count"].sum() > 0:
        formatted = [_fmt_count_percent(nulls, rows) for nulls, rows in zip(stats["null_count"], row_count)]
        table["nulls"] = _with_placeholders(pd.Series(formatted, dtype=str, index=stats.index))

    return table
255395

256396

257397
def _join_catalog_uri(col_upath: str | None, path: str) -> str:
@@ -309,3 +449,30 @@ def _catalog_uris(properties: CollectionProperties, uri: str | None) -> dict[str
309449
for column in index_columns
310450
],
311451
}
452+
453+
454+
def _get_example_frame(catalog: HealpixDataset, rng: np.random.Generator) -> npd.NestedFrame | None:
    """Load a frame from which an example row can be drawn.

    Returns ``None`` when the catalog has no path, the path does not exist,
    or the catalog has no HEALPix pixels to read. If a data thumbnail file
    exists under the catalog path it is read and returned; otherwise one
    pixel is chosen with ``rng`` and read.
    """
    if (root := catalog.catalog_path) is None or not root.exists():
        return None

    if (thumbnail_path := get_data_thumbnail_pointer(root)).exists():
        return read_parquet_file_to_pandas(thumbnail_path, is_dir=False)

    healpix_pixels = catalog.get_healpix_pixels()
    if len(healpix_pixels) == 0:
        # ``rng.choice`` raises ValueError on an empty sequence; a catalog
        # without partitions simply has no example to offer.
        return None
    pixel = rng.choice(healpix_pixels)
    return catalog.read_pixel_to_pandas(pixel)
464+
465+
466+
def _get_example_row(catalog: HealpixDataset) -> npd.NestedFrame | None:
    """Returns a single-row nested frame with a random example row.

    Returns ``None`` when no example frame can be loaded for the catalog or
    when the loaded frame has no rows.
    """
    # We want it to be pseudo-random but reproducible
    random_seed = 42
    rng = np.random.Generator(np.random.PCG64(random_seed))

    example_nf = _get_example_frame(catalog, rng)

    if example_nf is None or len(example_nf) == 0:
        # ``rng.integers(0)`` raises ValueError, so a frame without rows is
        # reported the same way as a missing frame.
        return None

    idx = rng.integers(len(example_nf))
    # Slice (rather than index) to keep a one-row frame instead of a Series.
    return example_nf.iloc[idx : idx + 1]

src/hats/io/templates/default_md_template.jinja2

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,26 @@ tags:
2222

2323
### Access the catalog
2424

25-
We recommended the use of the [LSDB](https://lsdb.io) Python framework to access HATS catalogs.
25+
We recommend the use of the [LSDB](https://lsdb.io) Python framework to access HATS catalogs.
2626
LSDB can be installed via `pip install lsdb` or `conda install conda-forge::lsdb`,
2727
see more details [in the docs](https://docs.lsdb.io/).
2828
The following code provides a minimal example of opening this catalog:
2929

3030
```python
3131
import lsdb
3232

33-
catalog = lsdb.open_catalog("{{uris["collection"]}}")
33+
# Full sky coverage{% if has_default_columns %}, default columns only{% endif %}.
34+
catalog = lsdb.open_catalog("{{ uris["collection"] }}")
35+
{%- if cone_code_example %}
36+
# One-degree cone{% if has_default_columns %}, all columns{% endif %}.
37+
catalog = lsdb.open_catalog(
38+
"{{ uris["collection"] }}",
39+
search_filter=lsdb.ConeSearch(ra={{ cone_code_example["ra"] }}, dec={{ cone_code_example["dec"] }}, radius_arcsec=3600.0),
40+
{%- if has_default_columns %}
41+
columns="all",
42+
{%- endif %}
43+
)
44+
{%- endif %}
3445
```
3546

3647
Each catalog in this collection is represented as a separate [Apache Parquet dataset](https://arrow.apache.org/docs/python/dataset.html) and can be accessed with a variety of tools, including `pandas`, `pyarrow`, `dask`, `Spark`, and `DuckDB`.
@@ -50,7 +61,7 @@ This catalog is represented by the following files and directories:
5061
{%- if cat_props.skymap_order %}
5162
- [`skymap.fits`]({{uris["primary"]["uri"]}}/skymap.fits) — HEALPix skymap FITS file with row-counts per HEALPix tile of fixed order {{cat_props.skymap_order}}
5263
{%- for alt_order in (cat_props.skymap_alt_orders or []) %}
53-
- [`skymap.{{alt_order}}.fits`]({{uris["primary"]["uri"]}}/skymap.{{alt_order}}.fits) — Same, but for order {{alt_order}}
64+
- [`skymap.{{alt_order}}.fits`]({{uris["primary"]["uri"]}}/skymap.{{alt_order}}.fits) — HEALPix skymap FITS file at order {{alt_order}}
5465
{%- endfor %}
5566
{%- endif %}
5667
{%- for margin in uris["margins"] %}
@@ -75,22 +86,41 @@ Metadata of the main HATS catalog, excluding margins and indexes:
7586

7687
The main HATS catalog contains the following columns:
7788

78-
| **Name** | {% for col in column_table.column %} **`{{col}}`** |{% endfor %}
79-
| --- | {% for _ in column_table.column %} --- |{% endfor %}
80-
| **Data Type** | {% for value in column_table.dtype %} `{{value}}` |{% endfor %}
81-
{%- if has_default_columns %}
82-
| **Default?** | {%- for value in column_table.default %} {{ "Yes" if value else "No" }} |{%- endfor %}
89+
| **Name** | {% for col in column_table.index %} **`{{col}}`** |{% endfor %}
90+
| --- | {% for _ in column_table.index %} --- |{% endfor %}
91+
| **Data Type** | {% for value in column_table.dtype %} {{value}} |{% endfor %}
92+
{%- if "default" in column_table %}
93+
| **Default?** | {% for value in column_table.default %} {{ "Yes" if value else "No" }} |{% endfor %}
94+
{%- endif %}
95+
{%- if "nested_into" in column_table %}
96+
| **Nested?** | {% for value in column_table.nested_into %} {{ value or "—" }} |{% endfor %}
97+
{%- endif %}
98+
{%- if "rows" in column_table %}
99+
| **Value count** | {% for value in column_table.rows %} {{ value }} |{% endfor %}
100+
{%- endif %}
101+
{%- if "nulls" in column_table %}
102+
| **Null count** | {% for value in column_table.nulls %} {{ value }} |{% endfor %}
83103
{%- endif %}
84-
{% if has_nested_columns -%}
85-
| **Nested?** | {%- for value in column_table.nested_into %} {{ value or "-" }} |{%- endfor %}
104+
{%- if "example" in column_table %}
105+
| **Example row** | {% for value in column_table.example %} {{ value }} |{% endfor %}
86106
{%- endif %}
87-
{%if has_default_columns %}
107+
{%- if "min_value" in column_table %}
108+
| **Minimum value** | {% for value in column_table.min_value %} {{ value }} |{% endfor %}
109+
{%- endif %}
110+
{%- if "max_value" in column_table %}
111+
| **Maximum value** | {% for value in column_table.max_value %} {{ value }} |{% endfor %}
112+
{%- endif %}
113+
{% if "default" in column_table %}
88114
"Default" indicates whether the column is included when loading the catalog with `lsdb.open_catalog()`
89115
without specifying any columns to load.
90116
The list of default columns is available in the [`hats.properties`]({{uris["primary"]["uri"]}}/hats.properties) file.
91117
{% endif %}
92-
{%if has_nested_columns %}
118+
{% if "nested_into" in column_table %}
93119
"Nested" indicates whether the column is stored as a nested field inside another "struct" column.
94120
{% endif %}
121+
{% if "rows" in column_table %}
122+
"Value count" may be different from the total number of rows for nested columns: each nested element is counted as a single value.
123+
{% if "nulls" in column_table %} "Null count" also refers to "elementary" values, not to rows. {% endif %}
124+
{% endif %}
95125

96126
{% endif %}

0 commit comments

Comments
 (0)