Skip to content

Commit 64673b5

Browse files
Common utilities for size estimation and binning (#613)
* Common utilities for size estimation and binning * Test coverage. Fun methods. * Ooopsies - pylint. * More testing. Use `sorted` argument of `np.unique`. * Apply suggestions from code review Co-authored-by: Olivia Lynn <olynn@andrew.cmu.edu> * black formatting. --------- Co-authored-by: Olivia Lynn <olynn@andrew.cmu.edu>
1 parent 427830a commit 64673b5

File tree

6 files changed

+432
-19
lines changed

6 files changed

+432
-19
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies = [
2626
"mocpy>=0.19.0",
2727
"nested-pandas>=0.4.1",
2828
"numba>=0.58",
29-
"numpy>=2,<3",
29+
"numpy>=2.3,<3",
3030
"pandas>=2.0",
3131
# NOTE: package PINNED at:
3232
# !=19.0.0 due to https://github.com/astronomy-commons/hats/pull/516

src/hats/catalog/dataset/table_properties.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from upath import UPath
1111

1212
from hats.catalog.catalog_type import CatalogType
13-
from hats.io import file_io
13+
from hats.io import file_io, size_estimates
1414

1515
## catalog_name, catalog_type, and total_rows are required for ALL types
1616
CATALOG_TYPE_REQUIRED_FIELDS = {
@@ -342,18 +342,6 @@ def new_provenance_dict(
342342
dict
343343
A dictionary with properties for the HATS catalog.
344344
"""
345-
346-
def _estimate_dir_size(target_dir):
347-
total_size = 0
348-
for item in target_dir.iterdir():
349-
if item.is_dir():
350-
total_size += _estimate_dir_size(item)
351-
else:
352-
total_size += item.stat().st_size
353-
return total_size
354-
355-
path = file_io.get_upath(path)
356-
357345
builder_str = ""
358346
if builder is not None:
359347
builder_str = f"{builder}, "
@@ -363,7 +351,7 @@ def _estimate_dir_size(target_dir):
363351
now = datetime.now(tz=timezone.utc)
364352
properties["hats_builder"] = builder_str
365353
properties["hats_creation_date"] = now.strftime("%Y-%m-%dT%H:%M%Z")
366-
properties["hats_estsize"] = int(_estimate_dir_size(path) / 1024) if path else 0
354+
properties["hats_estsize"] = size_estimates.estimate_dir_size(path, divisor=1024)
367355
properties["hats_release_date"] = "2025-08-22"
368356
properties["hats_version"] = "v1.0"
369357
return kwargs | properties

src/hats/io/size_estimates.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""General utilities for estimating size of input and output."""
2+
3+
import sys
4+
from pathlib import Path
5+
6+
import numpy as np
7+
import pandas as pd
8+
import pyarrow as pa
9+
from upath import UPath
10+
11+
from hats.io import file_io
12+
13+
14+
def estimate_dir_size(path: str | Path | UPath | None = None, *, divisor: int = 1) -> int:
    """Estimate the disk usage of a directory, and recursive contents.

    Args:
        path (str | Path | UPath | None): directory to measure. If ``None``
            (or it resolves to ``None``), the size is reported as 0.
        divisor (int): unit divisor applied to the byte total (e.g. 1024 for
            kibibytes). When ``divisor == 1``, returns size in bytes.

    Returns:
        int: total size of all files under ``path``, integer-divided by ``divisor``.
    """
    path = file_io.get_upath(path)
    if path is None:
        return 0

    def _recursive_size(target_dir):
        # Sum file sizes, descending depth-first into subdirectories.
        total_size = 0
        for item in target_dir.iterdir():
            if item.is_dir():
                total_size += _recursive_size(item)
            else:
                total_size += item.stat().st_size
        return total_size

    est_size = _recursive_size(path)
    if divisor > 1:
        # Floor division keeps the arithmetic in exact integers; the previous
        # int(est_size / divisor) round-tripped through a float, which loses
        # precision once the byte total exceeds 2**53.
        return est_size // divisor
    return est_size
35+
36+
37+
def _get_row_mem_size_data_frame(row):
    """Return the approximate in-memory size, in bytes, of one dataframe row.

    Args:
        row (tuple): a single row, as produced by ``DataFrame.itertuples``

    Returns:
        int: estimated memory footprint of the row in bytes
    """

    def _item_size(item):
        # Arrays contribute their raw buffer bytes plus the python object
        # overhead; everything else is measured with ``sys.getsizeof`` alone.
        if isinstance(item, np.ndarray):
            return item.nbytes + sys.getsizeof(item)
        return sys.getsizeof(item)

    # Start from the tuple's own overhead, then add each element's size.
    return sys.getsizeof(row) + sum(_item_size(item) for item in row)


def _get_row_mem_size_pa_table(table, row_index):
    """Return the approximate in-memory size, in bytes, of one pyarrow table row.

    Args:
        table (pa.Table): the pyarrow table
        row_index (int): the index of the row to measure

    Returns:
        int: estimated memory footprint of the row in bytes
    """
    # Begin with the overhead of the index object itself.
    total = sys.getsizeof(row_index)

    # Walk the columns, measuring the cell at ``row_index`` in each.
    for column in table.itercolumns():
        cell = column[row_index]
        if isinstance(cell, np.ndarray):
            # raw buffer bytes + python object overhead
            total += cell.nbytes + sys.getsizeof(cell)
        else:
            # Convert the arrow scalar to a plain python object before measuring.
            total += sys.getsizeof(cell.as_py())
    return total


def get_mem_size_per_row(data):
    """Given a 2D array of data, return a list of memory sizes for each row in the chunk.

    Args:
        data (pd.DataFrame or pa.Table): the data chunk to measure

    Returns:
        list[int]: list of memory sizes for each row in the chunk

    Raises:
        NotImplementedError: if ``data`` is neither a pandas dataframe nor a
            pyarrow table
    """
    if isinstance(data, pd.DataFrame):
        return [
            _get_row_mem_size_data_frame(row)
            for row in data.itertuples(index=False, name=None)
        ]
    if isinstance(data, pa.Table):
        return [_get_row_mem_size_pa_table(data, index) for index in range(data.num_rows)]
    raise NotImplementedError(f"Unsupported data type {type(data)} for memory size calculation")

src/hats/pixel_math/sparse_histogram.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ def from_file(cls, file_name):
7878
npzfile = np.load(file_name)
7979
return cls(npzfile["indexes"], npzfile["counts"], npzfile["order"])
8080

81+
def __eq__(self, value):
    """Compare two sparse histograms for equality.

    Two histograms are equal when they have the same healpix order and
    identical index and count arrays.
    """
    if not isinstance(value, SparseHistogram):
        # Defer to the other operand's __eq__ rather than deciding False
        # outright; python falls back to identity comparison (False for
        # distinct objects) if both sides decline. This is the standard
        # data-model idiom for mixed-type comparisons.
        return NotImplemented
    return (
        np.array_equal(self.indexes, value.indexes)
        and np.array_equal(self.counts, value.counts)
        and self.order == value.order
    )
89+
90+
def __str__(self):
    """Human-readable summary: the histogram order, followed by the index
    and count arrays on their own lines."""
    parts = [
        f"Histogram at order {self.order}",
        f" - indexes: {self.indexes}",
        f" - values: {self.counts}",
    ]
    return "\n".join(parts)
92+
8193

8294
class HistogramAggregator:
8395
"""Utility for aggregating sparse histograms."""
@@ -94,6 +106,8 @@ def add(self, other):
94106
other : SparseHistogram
95107
the wrapper containing the addend
96108
"""
109+
if other is None:
110+
return
97111
if not isinstance(other, SparseHistogram):
98112
raise ValueError("Both addends should be SparseHistogram.")
99113
if self.order != other.order:
@@ -109,3 +123,51 @@ def to_sparse(self):
109123
indexes = self.full_histogram.nonzero()[0]
110124
counts = self.full_histogram[indexes]
111125
return SparseHistogram(indexes, counts, self.order)
126+
127+
128+
def supplemental_count_histogram(mapped_pixels, supplemental_count, highest_order):
    """Specialized method for getting a histogram of some supplemental count,
    collating according to the pixels in the first argument.

    Typically used during import, when you wish to partition according to some supplemental
    data, such as in-memory size, or length of a nested column.

    Parameters
    ----------
    mapped_pixels : array_like of int
        1-D array of healpix pixel IDs. Values will be
        aggregated by pixel to produce the row-count histogram.
    supplemental_count : None or array_like of int
        Optional 1-D array of supplemental counts (for example per-row memory
        sizes or nested-column lengths). If ``None``, no supplemental histogram
        will be produced and the returned second element will be ``None``.
    highest_order : int
        Healpix order used for the histograms.

    Returns
    -------
    tuple
        ``(row_count_histo, supplemental_count_histo)`` where both elements are
        :class:`SparseHistogram`. ``row_count_histo`` contains counts of rows
        per pixel. ``supplemental_count_histo`` contains the sum of the
        supplemental counts per pixel, or ``None`` if ``supplemental_count`` was
        ``None``.

    Raises
    ------
    ValueError
        If ``supplemental_count`` is provided but its length does not match
        ``mapped_pixels``.
    """

    # ``sorted=True`` requires numpy>=2.3 (see the pyproject pin in this change).
    mapped_pixel, unique_inverse, count_at_pixel = np.unique(
        mapped_pixels, return_counts=True, sorted=True, return_inverse=True
    )
    row_count_histo = SparseHistogram(mapped_pixel, count_at_pixel, highest_order)

    supplemental_count_histo = None
    if supplemental_count is not None:
        if len(supplemental_count) != len(mapped_pixels):
            raise ValueError("mapped pixels and supplemental counts must be the same length")
        supplemental_sums = np.zeros(len(mapped_pixel), dtype=np.int64)

        # Vectorized scatter-add: accumulates supplemental_count[i] into
        # supplemental_sums[unique_inverse[i]] exactly like the per-row
        # ``+=`` loop, but unbuffered and in C (handles repeated indexes).
        np.add.at(supplemental_sums, unique_inverse, supplemental_count)

        supplemental_count_histo = SparseHistogram(mapped_pixel, supplemental_sums, highest_order)

    return (row_count_histo, supplemental_count_histo)

0 commit comments

Comments
 (0)