11import importlib .resources
2+ from itertools import starmap
23from pathlib import Path
34from typing import Literal
45
56import human_readable
67import jinja2
78import nested_pandas as npd
9+ import numpy as np
810import pandas as pd
911from upath import UPath
1012
1315from hats .catalog .healpix_dataset .healpix_dataset import HealpixDataset
1416from hats .io import get_common_metadata_pointer , get_partition_info_pointer , templates
1517from hats .io .file_io import get_upath , read_parquet_file_to_pandas
18+ from hats .io .paths import get_data_thumbnail_pointer
1619from hats .loaders .read_hats import read_hats
1720
1821
@@ -178,17 +181,24 @@ def generate_markdown_collection_summary(
178181 else :
179182 empty_nf = None
180183
181- has_nested_columns = False if empty_nf is None else len (empty_nf .nested_columns ) > 0
182-
183184 metadata_table = _gen_md_metadata_table (
184185 catalog , total_columns = None if empty_nf is None else empty_nf .shape [1 ]
185186 )
186187
187- column_table = (
188- pd .DataFrame ()
189- if empty_nf is None
190- else _gen_md_column_table (empty_nf , cat_props .default_columns or [])
191- )
188+ column_table = _gen_md_column_table (catalog , empty_nf )
189+
190+ if "example" in column_table :
191+ ra = np .round (float (column_table .loc [cat_props .ra_column ]["example" ]))
192+ if ra >= 360.0 :
193+ ra -= 360.0
194+ dec = np .round (float (column_table .loc [cat_props .dec_column ]["example" ]))
195+ if dec >= 90.0 :
196+ dec = 89.9
197+ if dec <= - 90.0 :
198+ dec = - 89.9
199+ cone_code_example = {"ra" : ra , "dec" : dec }
200+ else :
201+ cone_code_example = None
192202
193203 return template .render (
194204 name = name ,
@@ -197,11 +207,11 @@ def generate_markdown_collection_summary(
197207 cat_props = cat_props ,
198208 uris = uris ,
199209 has_partition_info = has_partition_info ,
210+ has_default_columns = bool (cat_props .default_columns ),
211+ cone_code_example = cone_code_example ,
200212 margin_thresholds = margin_thresholds ,
201213 uri = uri ,
202214 huggingface_metadata = huggingface_metadata ,
203- has_default_columns = cat_props .default_columns is not None ,
204- has_nested_columns = has_nested_columns ,
205215 metadata_table = metadata_table ,
206216 column_table = column_table ,
207217 )
@@ -230,28 +240,158 @@ def _gen_md_metadata_table(catalog: HealpixDataset, total_columns: int | None) -
230240 return metadata_table
231241
232242
233- def _gen_md_column_table (nf : npd .NestedFrame , default_columns : list [str ]) -> pd .DataFrame :
234- default_columns = frozenset (default_columns )
243+ def _fmt_count_percent (n : int , total : int ) -> str :
244+ if n == 0 :
245+ return "0"
246+ percent = round (n / total * 100 , 2 )
247+ if percent < 0.01 :
248+ return f"{ n :,} (<0.01%)"
249+ return f"{ n :,} ({ percent } %)"
250+
251+
252+ def _hard_truncate (s : str , limit : int ) -> str :
253+ if len (s ) <= limit :
254+ return s
255+ return s [: limit - 1 ] + "…"
256+
257+
258+ def _format_example_value (
259+ value , * , float_precision : int = 4 , soft_limit : int = 50 , hard_limit : int = 70
260+ ) -> str :
261+ """Format an example value for display in a summary table.
262+
263+ Floats are rounded to a limited number of significant figures.
264+ Lists are shown with as many items as fit within ``soft_limit``
265+ characters (always at least one), with a ``(N total)`` suffix when
266+ truncated. Any resulting string longer than ``hard_limit`` is
267+ truncated with ``…``.
268+ """
269+ if value is None :
270+ return "*NULL*"
271+
272+ if isinstance (value , (float , np .floating )):
273+ if np .isnan (value ):
274+ return "*NaN*"
275+ if np .isinf (value ):
276+ return "-∞" if value < 0 else "∞"
277+ return f"{ value :.{float_precision }g} "
278+
279+ if isinstance (value , (list , tuple , np .ndarray )):
280+ items = list (value )
281+ if len (items ) == 0 :
282+ return "[]"
283+ fmt_kwargs = {"float_precision" : float_precision , "soft_limit" : soft_limit , "hard_limit" : hard_limit }
284+ suffix = f", … ({ len (items )} total)]"
285+ # Always include at least one item
286+ parts = [_format_example_value (items [0 ], ** fmt_kwargs )]
287+ for item in items [1 :]:
288+ candidate = _format_example_value (item , ** fmt_kwargs )
289+ # Check if adding this item would exceed the soft limit,
290+ # accounting for the truncation suffix
291+ preview = "[" + ", " .join (parts + [candidate ]) + suffix
292+ if len (preview ) > soft_limit :
293+ break
294+ parts .append (candidate )
295+ if len (parts ) < len (items ):
296+ result = "[" + ", " .join (parts ) + suffix
297+ else :
298+ result = "[" + ", " .join (parts ) + "]"
299+ else :
300+ result = str (value )
301+
302+ return _hard_truncate (result , hard_limit )
303+
304+
def _build_column_table(
    nf: npd.NestedFrame, default_columns, fmt_value=_format_example_value
) -> pd.DataFrame:
    """Build column info table from a NestedFrame and default column names.

    The result is indexed by column name (index name ``column``); each
    sub-column of a nested column gets its own row. A ``dtype`` column is
    always present. ``default`` is added only when ``default_columns`` is
    non-empty, ``nested_into`` only when ``nf`` has nested columns, and
    ``example`` (first-row values passed through ``fmt_value``) only when
    ``nf`` has at least one row.
    """
    default_columns = frozenset(default_columns or [])
    has_nested_columns = len(nf.nested_columns) > 0
    has_example_row = not nf.empty

    column = []
    dtype = []
    # Optional columns stay ``None`` when they would carry no information.
    default = [] if default_columns else None
    nested_into = [] if has_nested_columns else None
    example = [] if has_example_row else None

    for name, dt in nf.dtypes.items():
        cell = nf[name].iloc[0] if has_example_row else None
        if isinstance(dt, npd.NestedDtype):
            subcolumns = nf.get_subcolumns(name)
            column.extend(subcolumns)
            dtype.extend(f"list[{nf[sc].dtype.pyarrow_dtype}]" for sc in subcolumns)
            if default is not None:
                default.extend(name in default_columns or sc in default_columns for sc in subcolumns)
            if nested_into is not None:
                nested_into.extend([name] * len(subcolumns))
            # An empty frame has no first row, so there is no nested cell to unpack.
            if example is not None:
                example.extend(fmt_value(series.to_list()) for _, series in cell.items())
        else:
            column.append(name)
            dtype.append(str(dt.pyarrow_dtype))
            if default is not None:
                default.append(name in default_columns)
            if nested_into is not None:
                nested_into.append(None)
            if example is not None:
                example.append(fmt_value(cell))

    index = pd.Index(column, name="column")
    result = pd.DataFrame(
        {
            "dtype": pd.Series(dtype, dtype=str, index=index),
        },
        index=index,
    )
    if default is not None:
        result["default"] = pd.Series(default, dtype=bool, index=index)
    if nested_into is not None:
        result["nested_into"] = pd.Series(nested_into, dtype=str, index=index)
    if example is not None:
        result["example"] = pd.Series(example, dtype=object, index=index)

    return result
354+
253355
254- return pd .DataFrame ({"column" : column , "dtype" : dtype , "default" : default , "nested_into" : nested_into })
def _gen_md_column_table(
    catalog: HealpixDataset, empty_nf: npd.NestedFrame | None, fmt_value=_format_example_value
) -> pd.DataFrame:
    """Assemble the per-column table used by the markdown summary.

    The example row from ``_get_example_row`` is used when available and
    ``empty_nf`` otherwise; with neither, an empty DataFrame is returned.
    The table built by ``_build_column_table`` is then extended from
    ``catalog.aggregate_column_statistics``: ``min_value`` and ``max_value``
    always (unless the statistics are empty, in which case the table is
    returned as is), ``rows`` when any column's row count differs from
    ``total_rows``, and ``nulls`` when any nulls were counted. Columns that
    are missing from the statistics get ``*N/A*`` in those fields.
    """
    catalog_info = catalog.catalog_info

    sample = _get_example_row(catalog)
    frame = empty_nf if sample is None else sample
    if frame is None:
        return pd.DataFrame()

    table = _build_column_table(frame, catalog_info.default_columns, fmt_value)

    stats = catalog.aggregate_column_statistics(exclude_hats_columns=False)
    if stats.empty:
        return table

    absent = list(set(table.index).difference(stats.index))

    def with_placeholders(values):
        # Adds one "*N/A*" entry per table column the statistics do not cover.
        for label in absent:
            values.loc[label] = "*N/A*"
        return values

    for stat_name in ("min_value", "max_value"):
        table[stat_name] = with_placeholders(stats[stat_name].map(fmt_value))

    row_count = stats["row_count"]
    if np.any(row_count != catalog_info.total_rows):
        table["rows"] = with_placeholders(row_count.map(lambda count: f"{count:,}"))

    null_count = stats["null_count"]
    if null_count.sum() > 0:
        formatted = [_fmt_count_percent(nulls, rows) for nulls, rows in zip(null_count, row_count)]
        table["nulls"] = with_placeholders(pd.Series(formatted, dtype=str, index=stats.index))

    return table
255395
256396
257397def _join_catalog_uri (col_upath : str | None , path : str ) -> str :
@@ -309,3 +449,30 @@ def _catalog_uris(properties: CollectionProperties, uri: str | None) -> dict[str
309449 for column in index_columns
310450 ],
311451 }
452+
453+
def _get_example_frame(catalog: HealpixDataset, rng: np.random.Generator) -> npd.NestedFrame | None:
    """Load a frame of catalog data from which an example row can be drawn.

    Reads the data thumbnail when the catalog has one; otherwise reads one
    partition chosen with ``rng``. Returns ``None`` when the catalog has no
    path, the path does not exist, or there are no partitions to choose from.
    """
    if (root := catalog.catalog_path) is None or not root.exists():
        return None

    if (thumbnail_path := get_data_thumbnail_pointer(root)).exists():
        return read_parquet_file_to_pandas(thumbnail_path, is_dir=False)

    healpix_pixels = catalog.get_healpix_pixels()
    if len(healpix_pixels) == 0:
        # rng.choice raises ValueError on an empty sequence.
        return None
    pixel = rng.choice(healpix_pixels)
    return catalog.read_pixel_to_pandas(pixel)
464+
465+
def _get_example_row(catalog: HealpixDataset) -> npd.NestedFrame | None:
    """Return a single-row nested frame with a pseudo-random example row.

    The generator is seeded with a fixed value so repeated calls pick the
    same row. Returns ``None`` when no example frame could be loaded or when
    the loaded frame has no rows.
    """
    # We want it to be pseudo-random but reproducible
    random_seed = 42
    rng = np.random.Generator(np.random.PCG64(random_seed))

    example_nf = _get_example_frame(catalog, rng)

    if example_nf is None or len(example_nf) == 0:
        # rng.integers(0) raises ValueError, so an empty frame yields no example.
        return None

    idx = rng.integers(len(example_nf))
    return example_nf.iloc[idx : idx + 1]
0 commit comments