Skip to content

Commit 64673b5

Browse files
Common utilities for size estimation and binning (#613)
* Common utilities for size estimation and binning * Test coverage. Fun methods. * Ooopsies - pylint. * More testing. Use `sorted` argument of `np.unique`. * Apply suggestions from code review Co-authored-by: Olivia Lynn <olynn@andrew.cmu.edu> * black formatting. --------- Co-authored-by: Olivia Lynn <olynn@andrew.cmu.edu>
1 parent 427830a commit 64673b5

File tree

6 files changed

+432
-19
lines changed

6 files changed

+432
-19
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies = [
2626
"mocpy>=0.19.0",
2727
"nested-pandas>=0.4.1",
2828
"numba>=0.58",
29-
"numpy>=2,<3",
29+
"numpy>=2.3,<3",
3030
"pandas>=2.0",
3131
# NOTE: package PINNED at:
3232
# !=19.0.0 due to https://github.com/astronomy-commons/hats/pull/516

src/hats/catalog/dataset/table_properties.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from upath import UPath
1111

1212
from hats.catalog.catalog_type import CatalogType
13-
from hats.io import file_io
13+
from hats.io import file_io, size_estimates
1414

1515
## catalog_name, catalog_type, and total_rows are required for ALL types
1616
CATALOG_TYPE_REQUIRED_FIELDS = {
@@ -342,18 +342,6 @@ def new_provenance_dict(
342342
dict
343343
A dictionary with properties for the HATS catalog.
344344
"""
345-
346-
def _estimate_dir_size(target_dir):
347-
total_size = 0
348-
for item in target_dir.iterdir():
349-
if item.is_dir():
350-
total_size += _estimate_dir_size(item)
351-
else:
352-
total_size += item.stat().st_size
353-
return total_size
354-
355-
path = file_io.get_upath(path)
356-
357345
builder_str = ""
358346
if builder is not None:
359347
builder_str = f"{builder}, "
@@ -363,7 +351,7 @@ def _estimate_dir_size(target_dir):
363351
now = datetime.now(tz=timezone.utc)
364352
properties["hats_builder"] = builder_str
365353
properties["hats_creation_date"] = now.strftime("%Y-%m-%dT%H:%M%Z")
366-
properties["hats_estsize"] = int(_estimate_dir_size(path) / 1024) if path else 0
354+
properties["hats_estsize"] = size_estimates.estimate_dir_size(path, divisor=1024)
367355
properties["hats_release_date"] = "2025-08-22"
368356
properties["hats_version"] = "v1.0"
369357
return kwargs | properties

src/hats/io/size_estimates.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""General utilities for estimating size of input and output."""
2+
3+
import sys
4+
from pathlib import Path
5+
6+
import numpy as np
7+
import pandas as pd
8+
import pyarrow as pa
9+
from upath import UPath
10+
11+
from hats.io import file_io
12+
13+
14+
def estimate_dir_size(path: str | Path | UPath | None = None, *, divisor: int = 1) -> int:
    """Estimate the disk usage of a directory, and recursive contents.

    Args:
        path (str | Path | UPath | None): directory to measure. If ``None``
            (or it resolves to ``None``), the size is reported as 0.
        divisor (int): unit divisor applied to the byte total (e.g. 1024 for
            kibibytes). When ``divisor == 1``, returns size in bytes.

    Returns:
        int: total size of all files under ``path``, integer-divided by ``divisor``.
    """
    path = file_io.get_upath(path)
    if path is None:
        return 0

    def _recursive_size(target_dir):
        # Sum file sizes, descending depth-first into subdirectories.
        total_size = 0
        for item in target_dir.iterdir():
            if item.is_dir():
                total_size += _recursive_size(item)
            else:
                total_size += item.stat().st_size
        return total_size

    est_size = _recursive_size(path)
    if divisor > 1:
        # Floor division keeps the arithmetic in exact integers; the previous
        # int(est_size / divisor) round-tripped through a float, which loses
        # precision once the byte total exceeds 2**53.
        return est_size // divisor
    return est_size
35+
36+
37+
def _get_row_mem_size_data_frame(row):
    """Return the approximate in-memory size, in bytes, of one dataframe row.

    Args:
        row (tuple): a single row, as produced by ``DataFrame.itertuples``

    Returns:
        int: estimated memory footprint of the row in bytes
    """

    def _item_size(item):
        # Arrays contribute their raw buffer bytes plus the python object
        # overhead; everything else is measured with ``sys.getsizeof`` alone.
        if isinstance(item, np.ndarray):
            return item.nbytes + sys.getsizeof(item)
        return sys.getsizeof(item)

    # Start from the tuple's own overhead, then add each element's size.
    return sys.getsizeof(row) + sum(_item_size(item) for item in row)


def _get_row_mem_size_pa_table(table, row_index):
    """Return the approximate in-memory size, in bytes, of one pyarrow table row.

    Args:
        table (pa.Table): the pyarrow table
        row_index (int): the index of the row to measure

    Returns:
        int: estimated memory footprint of the row in bytes
    """
    # Begin with the overhead of the index object itself.
    total = sys.getsizeof(row_index)

    # Walk the columns, measuring the cell at ``row_index`` in each.
    for column in table.itercolumns():
        cell = column[row_index]
        if isinstance(cell, np.ndarray):
            # raw buffer bytes + python object overhead
            total += cell.nbytes + sys.getsizeof(cell)
        else:
            # Convert the arrow scalar to a plain python object before measuring.
            total += sys.getsizeof(cell.as_py())
    return total


def get_mem_size_per_row(data):
    """Given a 2D array of data, return a list of memory sizes for each row in the chunk.

    Args:
        data (pd.DataFrame or pa.Table): the data chunk to measure

    Returns:
        list[int]: list of memory sizes for each row in the chunk

    Raises:
        NotImplementedError: if ``data`` is neither a pandas dataframe nor a
            pyarrow table
    """
    if isinstance(data, pd.DataFrame):
        return [
            _get_row_mem_size_data_frame(row)
            for row in data.itertuples(index=False, name=None)
        ]
    if isinstance(data, pa.Table):
        return [_get_row_mem_size_pa_table(data, index) for index in range(data.num_rows)]
    raise NotImplementedError(f"Unsupported data type {type(data)} for memory size calculation")

src/hats/pixel_math/sparse_histogram.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ def from_file(cls, file_name):
7878
npzfile = np.load(file_name)
7979
return cls(npzfile["indexes"], npzfile["counts"], npzfile["order"])
8080

81+
def __eq__(self, value):
    """Compare two sparse histograms for equality.

    Two histograms are equal when they have the same healpix order and
    identical index and count arrays.
    """
    if not isinstance(value, SparseHistogram):
        # Defer to the other operand's __eq__ rather than deciding False
        # outright; python falls back to identity comparison (False for
        # distinct objects) if both sides decline. This is the standard
        # data-model idiom for mixed-type comparisons.
        return NotImplemented
    return (
        np.array_equal(self.indexes, value.indexes)
        and np.array_equal(self.counts, value.counts)
        and self.order == value.order
    )
89+
90+
def __str__(self):
    """Human-readable summary: the histogram order, followed by the index
    and count arrays on their own lines."""
    parts = [
        f"Histogram at order {self.order}",
        f" - indexes: {self.indexes}",
        f" - values: {self.counts}",
    ]
    return "\n".join(parts)
92+
8193

8294
class HistogramAggregator:
8395
"""Utility for aggregating sparse histograms."""
@@ -94,6 +106,8 @@ def add(self, other):
94106
other : SparseHistogram
95107
the wrapper containing the addend
96108
"""
109+
if other is None:
110+
return
97111
if not isinstance(other, SparseHistogram):
98112
raise ValueError("Both addends should be SparseHistogram.")
99113
if self.order != other.order:
@@ -109,3 +123,51 @@ def to_sparse(self):
109123
indexes = self.full_histogram.nonzero()[0]
110124
counts = self.full_histogram[indexes]
111125
return SparseHistogram(indexes, counts, self.order)
126+
127+
128+
def supplemental_count_histogram(mapped_pixels, supplemental_count, highest_order):
    """Specialized method for getting a histogram of some supplemental count,
    collating according to the pixels in the first argument.

    Typically used during import, when you wish to partition according to some supplemental
    data, such as in-memory size, or length of a nested column.

    Parameters
    ----------
    mapped_pixels : array_like of int
        1-D array of healpix pixel IDs. Values will be
        aggregated by pixel to produce the row-count histogram.
    supplemental_count : None or array_like of int
        Optional 1-D array of supplemental counts (for example per-row memory
        sizes or nested-column lengths). If ``None``, no supplemental histogram
        will be produced and the returned second element will be ``None``.
    highest_order : int
        Healpix order used for the histograms.

    Returns
    -------
    tuple
        ``(row_count_histo, supplemental_count_histo)`` where both elements are
        :class:`SparseHistogram`. ``row_count_histo`` contains counts of rows
        per pixel. ``supplemental_count_histo`` contains the sum of the
        supplemental counts per pixel, or ``None`` if ``supplemental_count`` was
        ``None``.

    Raises
    ------
    ValueError
        If ``supplemental_count`` is provided but its length does not match
        ``mapped_pixels``.
    """

    # ``sorted=True`` requires numpy>=2.3 (see the pyproject pin in this change).
    mapped_pixel, unique_inverse, count_at_pixel = np.unique(
        mapped_pixels, return_counts=True, sorted=True, return_inverse=True
    )
    row_count_histo = SparseHistogram(mapped_pixel, count_at_pixel, highest_order)

    supplemental_count_histo = None
    if supplemental_count is not None:
        if len(supplemental_count) != len(mapped_pixels):
            raise ValueError("mapped pixels and supplemental counts must be the same length")
        supplemental_sums = np.zeros(len(mapped_pixel), dtype=np.int64)

        # Vectorized scatter-add: accumulates supplemental_count[i] into
        # supplemental_sums[unique_inverse[i]] exactly like the per-row
        # ``+=`` loop, but unbuffered and in C (handles repeated indexes).
        np.add.at(supplemental_sums, unique_inverse, supplemental_count)

        supplemental_count_histo = SparseHistogram(mapped_pixel, supplemental_sums, highest_order)

    return (row_count_histo, supplemental_count_histo)

0 commit comments

Comments
 (0)