Skip to content

Commit 9f8ba01

Browse files
authored
Rename histogram directory (histograms/ -> row_count_histograms/) (#602)
* Rename histogram directory (histograms/ -> row_count_histograms/); retain legacy support
* Remove legacy dir support in get_remaining_map_keys, as it's just for partials/intermediate files anyway
* Remove all legacy support for old histogram dir name
1 parent 600cb41 commit 9f8ba01

File tree

4 files changed

+22
-18
lines changed

4 files changed

+22
-18
lines changed

src/hats_import/catalog/resume_plan.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@ class ResumePlan(PipelineResumePlan):
3939
SPLITTING_STAGE = "splitting"
4040
REDUCING_STAGE = "reducing"
4141

42-
HISTOGRAM_BINARY_FILE = "mapping_histogram.npz"
43-
HISTOGRAMS_DIR = "histograms"
42+
ROW_COUNT_HISTOGRAM_BINARY_FILE = "row_count_mapping_histogram.npz"
43+
ROW_COUNT_HISTOGRAMS_DIR = "row_count_histograms"
44+
4445
ALIGNMENT_FILE = "alignment.pickle"
4546

4647
# pylint: disable=too-many-arguments
@@ -114,7 +115,7 @@ def gather_plan(self, run_stages: list[str] | None = None):
114115
if self.should_run_mapping:
115116
self.map_files = self.get_remaining_map_keys()
116117
file_io.make_directory(
117-
file_io.append_paths_to_pointer(self.tmp_path, self.HISTOGRAMS_DIR),
118+
file_io.append_paths_to_pointer(self.tmp_path, self.ROW_COUNT_HISTOGRAMS_DIR),
118119
exist_ok=True,
119120
)
120121
if self.should_run_splitting:
@@ -144,7 +145,7 @@ def get_remaining_map_keys(self):
144145
Returns:
145146
list of mapping keys *not* found in files like /resume/path/mapping_key.npz
146147
"""
147-
prefix = file_io.get_upath(self.tmp_path) / self.HISTOGRAMS_DIR
148+
prefix = file_io.get_upath(self.tmp_path) / self.ROW_COUNT_HISTOGRAMS_DIR
148149
map_file_pattern = re.compile(r"map_(\d+).npz")
149150
done_indexes = [int(map_file_pattern.match(path.name).group(1)) for path in prefix.glob("*.npz")]
150151
remaining_indexes = list(set(range(0, len(self.input_paths))) - (set(done_indexes)))
@@ -157,24 +158,27 @@ def read_histogram(self, healpix_order):
157158
- Otherwise, combine histograms from partials
158159
- Otherwise, return an empty histogram
159160
"""
160-
file_name = file_io.append_paths_to_pointer(self.tmp_path, self.HISTOGRAM_BINARY_FILE)
161+
file_name = file_io.append_paths_to_pointer(self.tmp_path, self.ROW_COUNT_HISTOGRAM_BINARY_FILE)
162+
163+
# Otherwise, read the histogram from partial histograms and combine.
161164
if not file_io.does_file_or_directory_exist(file_name):
162-
# Read the histogram from partial histograms and combine.
163165
remaining_map_files = self.get_remaining_map_keys()
164166
if len(remaining_map_files) > 0:
165167
raise RuntimeError(f"{len(remaining_map_files)} map stages did not complete successfully.")
166-
histogram_files = file_io.find_files_matching_path(self.tmp_path, self.HISTOGRAMS_DIR, "*.npz")
168+
histogram_files = file_io.find_files_matching_path(
169+
self.tmp_path, self.ROW_COUNT_HISTOGRAMS_DIR, "*.npz"
170+
)
167171
aggregate_histogram = HistogramAggregator(healpix_order)
168172
for partial_file_name in histogram_files:
169173
partial = SparseHistogram.from_file(partial_file_name)
170174
aggregate_histogram.add(partial)
171175

172-
file_name = file_io.append_paths_to_pointer(self.tmp_path, self.HISTOGRAM_BINARY_FILE)
176+
file_name = file_io.append_paths_to_pointer(self.tmp_path, self.ROW_COUNT_HISTOGRAM_BINARY_FILE)
173177
with open(file_name, "wb+") as file_handle:
174178
file_handle.write(aggregate_histogram.full_histogram)
175179
if self.delete_resume_log_files:
176180
file_io.remove_directory(
177-
file_io.append_paths_to_pointer(self.tmp_path, self.HISTOGRAMS_DIR),
181+
file_io.append_paths_to_pointer(self.tmp_path, self.ROW_COUNT_HISTOGRAMS_DIR),
178182
ignore_errors=True,
179183
)
180184

@@ -200,10 +204,10 @@ def partial_histogram_file(cls, tmp_path, mapping_key: str):
200204
mapping_key (str): unique string for each mapping task (e.g. "map_57")
201205
"""
202206
file_io.make_directory(
203-
file_io.append_paths_to_pointer(tmp_path, cls.HISTOGRAMS_DIR),
207+
file_io.append_paths_to_pointer(tmp_path, cls.ROW_COUNT_HISTOGRAMS_DIR),
204208
exist_ok=True,
205209
)
206-
return file_io.append_paths_to_pointer(tmp_path, cls.HISTOGRAMS_DIR, f"{mapping_key}.npz")
210+
return file_io.append_paths_to_pointer(tmp_path, cls.ROW_COUNT_HISTOGRAMS_DIR, f"{mapping_key}.npz")
207211

208212
def get_remaining_split_keys(self):
209213
"""Gather remaining keys, dropping successful split tasks from done file names.
@@ -266,7 +270,7 @@ def get_alignment_file(
266270
highest_healpix_order (int): the highest healpix order (e.g. 5-10)
267271
lowest_healpix_order (int): the lowest healpix order (e.g. 1-5). specifying a lowest order
268272
constrains the partitioning to prevent spatially large pixels.
269-
threshold (int): the maximum number of objects allowed in a single pixel
273+
pixel_threshold (int): the maximum number of objects allowed in a single pixel
270274
drop_empty_siblings (bool): if 3 of 4 pixels are empty, keep only the non-empty pixel
271275
expected_total_rows (int): number of expected rows found in the dataset.
272276

tests/hats_import/catalog/test_map_reduce.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,14 @@ def test_read_bad_fileformat(blank_data_file, capsys, tmp_path):
8787

8888
def read_partial_histogram(tmp_path, mapping_key):
8989
"""Helper to read in the former result of a map operation."""
90-
histogram_file = tmp_path / "histograms" / f"{mapping_key}.npz"
90+
histogram_file = tmp_path / "row_count_histograms" / f"{mapping_key}.npz"
9191
hist = SparseHistogram.from_file(histogram_file)
9292
return hist.to_array()
9393

9494

9595
def test_read_single_fits(tmp_path, formats_fits):
9696
"""Success case - fits file that exists being read as fits"""
97-
(tmp_path / "histograms").mkdir(parents=True)
97+
(tmp_path / "row_count_histograms").mkdir(parents=True)
9898
mr.map_to_pixels(
9999
input_file=formats_fits,
100100
pickled_reader_file=pickle_file_reader(tmp_path, get_file_reader("fits")),

tests/hats_import/catalog/test_run_import.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_resume_dask_runner_diff_pixel_order(
144144
## Now set up our resume files to match previous work.
145145
resume_tmp = tmp_path / "tmp" / "resume_catalog"
146146
ResumePlan(tmp_path=resume_tmp, progress_bar=False)
147-
SparseHistogram([11], [131], 0).to_dense_file(resume_tmp / "mapping_histogram.npz")
147+
SparseHistogram([11], [131], 0).to_dense_file(resume_tmp / "row_count_mapping_histogram.npz")
148148
for file_index in range(0, 5):
149149
ResumePlan.touch_key_done_file(resume_tmp, ResumePlan.SPLITTING_STAGE, f"split_{file_index}")
150150

tests/hats_import/catalog/test_run_round_trip.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -551,20 +551,20 @@ def assert_stage_level_files_exist(base_intermediate_dir):
551551
# `small_sky_object_catalog` at order 0.
552552
expected_contents = [
553553
"alignment.pickle",
554-
"histograms", # directory containing sub-histograms
555554
"input_paths.txt", # original input paths for subsequent comparison
556555
"mapping_done", # stage-level done file
557-
"mapping_histogram.npz", # concatenated histogram file
558556
"order_0", # all intermediate parquet files
559557
"reader.pickle", # pickled InputReader
560558
"reducing", # directory containing task-level done files
561559
"reducing_done", # stage-level done file
560+
"row_count_histograms", # directory containing sub-histograms
561+
"row_count_mapping_histogram.npz", # concatenated histogram file
562562
"splitting", # directory containing task-level done files
563563
"splitting_done", # stage-level done file
564564
]
565565
assert_directory_contains(base_intermediate_dir, expected_contents)
566566

567-
checking_dir = base_intermediate_dir / "histograms"
567+
checking_dir = base_intermediate_dir / "row_count_histograms"
568568
assert_directory_contains(
569569
checking_dir, ["map_0.npz", "map_1.npz", "map_2.npz", "map_3.npz", "map_4.npz", "map_5.npz"]
570570
)

0 commit comments

Comments
 (0)