
Commit 2344e5f

Add a check for malformed manifest files
- When manifest or filter_catalog files contain filenames that are obviously for the wrong object ID, the relevant object is dropped and a warning is emitted.
- The testing system is extended so that tests for malformed manifests can be written.
- Small fixup to the .setup_dev.sh script to ensure its promise to "not output logs" unless there is an error actually holds.
1 parent 9bc0bae commit 2344e5f

3 files changed (+99, -5 lines)


.setup_dev.sh

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ python -m pip install -e . > /dev/null
 echo "Installing developer dependencies in local environment"
 echo "This might take a few minutes. Only errors will be printed to stdout"
 python -m pip install -e .'[dev]' > /dev/null
-if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
+if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi

 echo "Installing pre-commit"
 pre-commit install > /dev/null
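Note on this hunk: redirecting with "> /dev/null" discards stdout only, while pip's error output goes to stderr and still reaches the terminal. Silencing the docs requirements install the same way as the other pip calls makes the script's "Only errors will be printed to stdout" message accurate.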

src/fibad/data_sets/hsc_data_set.py

Lines changed: 24 additions & 1 deletion
@@ -389,15 +389,38 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
                 logger.info(f"Filters for object {object_id} were {filters_unsorted}")
                 logger.debug(f"Reference filters were {filters_ref}")

-            # Drop objects that can't meet the coutout size provided
             elif cutout_shape is not None:
+                # Drop objects that can't meet the cutout size provided
                 for shape in self.dims[object_id]:
                     if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
                         msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
                         msg += " this is too small for the given cutout size of "
                         msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
                         self._mark_for_prune(object_id, msg)
                         break
+
+                # Drop objects where the cutouts are not the same size
+                first_shape = None
+                for shape in self.dims[object_id]:
+                    first_shape = shape if first_shape is None else first_shape
+                    if shape != first_shape:
+                        msg = f"The first filter for object {object_id} has a shape of "
+                        msg += f"({first_shape[0]}px,{first_shape[1]}px) another filter has shape of"
+                        msg += f"({shape[0]}px,{shape[1]}px)"
+                        self._mark_for_prune(object_id, msg)
+                        break
+
+                # Drop objects where parsing the filenames does not reveal the object IDs
+                for filter, filepath in filters_unsorted.items():
+                    filename = Path(filepath).name
+                    # Check beginning of filename vs object_id
+                    if filename[:17] != object_id:
+                        msg = f"Filter {filter} for object id {object_id} has filename {filepath} listed"
+                        msg += "The filename does not match the object_id, and the filter_catalog or "
+                        msg += "manifest is likely corrupt."
+                        self._mark_for_prune(object_id, msg)
+                        break
+
             if index != 0 and index % 1_000_000 == 0:
                 logger.info(f"Processed {index} objects for pruning")
         else:
tests/fibad/test_hsc_dataset.py

Lines changed: 74 additions & 3 deletions
@@ -1,12 +1,14 @@
 import logging
 import unittest.mock as mock
 from pathlib import Path
+from typing import Any, Optional

 import numpy as np
 import pytest
-from fibad.data_sets.hsc_data_set import HSCDataSet
 from torchvision.transforms.v2 import CenterCrop, Lambda

+from fibad.data_sets.hsc_data_set import HSCDataSet
+
 test_dir = Path(__file__).parent / "test_data" / "dataloader"

 HSCDataSet._called_from_test = True
@@ -27,8 +29,8 @@ class FakeFitsFS:
     more filesystem operations without a really good reason.
     """

-    def __init__(self, test_files: dict):
-        self.patchers: list[mock._patch[mock.Mock]] = []
+    def __init__(self, test_files: dict, filter_catalog: Optional[dict] = None):
+        self.patchers: list[mock._patch[Any]] = []

         self.test_files = test_files

@@ -39,6 +41,12 @@ def __init__(self, test_files: dict):
         mock_fits_open = mock.Mock(side_effect=self._open_file)
         self.patchers.append(mock.patch("astropy.io.fits.open", mock_fits_open))

+        if filter_catalog is not None:
+            mock_read_filter_catalog = mock.patch(
+                "fibad.data_sets.hsc_data_set.HSCDataSet._read_filter_catalog", lambda x, y: filter_catalog
+            )
+            self.patchers.append(mock_read_filter_catalog)
+
     def _open_file(self, filename: Path, **kwargs) -> mock.Mock:
         shape = self.test_files[filename.name]
         mock_open_ctx = mock.Mock()
@@ -117,6 +125,24 @@ def generate_files(
     return test_files


+def generate_filter_catalog(test_files: dict) -> dict:
+    """Generates a filter catalog dict for use with FakeFitsFS from a filesystem dictionary
+    created by generate_files.
+
+    This allows tests to alter the parsed filter_catalog, and interrogate what decisions HSCDataSet makes
+    when a manifest or filter_catalog file contains corrupt information.
+
+    Returns
+    -------
+    dict
+        Dictionary from ObjectID -> (Dictionary from Filter -> Filename)
+    """
+
+    # Use our initialization code to create a parsed files object
+    with FakeFitsFS(test_files):
+        return HSCDataSet(mkconfig(crop_to=(99, 99))).files
+
+
 def test_load(caplog):
     """Test to ensure loading a perfectly regular set of files works"""
     caplog.set_level(logging.WARNING)
@@ -313,6 +339,51 @@ def test_prune_size(caplog):
     assert "too small" in caplog.text


+def test_prune_filter_size_mismatch(caplog):
+    """Test to ensure images with different sizes per filter will be dropped"""
+    caplog.set_level(logging.WARNING)
+    test_files = {}
+    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
+    test_files["00000000000000000_all_filters_HSC-R.fits"] = (99, 99)
+    print(test_files)
+
+    with FakeFitsFS(test_files):
+        a = HSCDataSet(mkconfig(crop_to=(99, 99)))
+
+    assert len(a) == 9
+    assert a.shape() == (5, 99, 99)
+
+    # We should warn that we are dropping objects and the reason
+    assert "Dropping object" in caplog.text
+    assert "first filter" in caplog.text
+
+
+def test_prune_bad_filename(caplog):
+    """Test to ensure images with filenames set wrong will be dropped"""
+    caplog.set_level(logging.WARNING)
+    test_files = {}
+    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
+
+    # Create a filter catalog with wrong file information.
+    filter_catalog = generate_filter_catalog(test_files)
+    filters = list(filter_catalog["00000000000000000"].keys())
+    filter_catalog["00000000000000000"][filters[0]] = filter_catalog["00000000000000001"][filters[0]]
+
+    with FakeFitsFS(test_files, filter_catalog):
+        # Initialize HSCDataset exercising the filter_catalog provided initialization pathway
+        a = HSCDataSet(mkconfig(crop_to=(99, 99), filter_catalog="notarealfile.fits"))
+
+    # Verify that the broken object has been dropped
+    assert len(a) == 9
+
+    # Verify the shape is correct.
+    assert a.shape() == (5, 99, 99)
+
+    # We should warn that we are dropping objects and the correct reason
+    assert "Dropping object" in caplog.text
+    assert "manifest is likely corrupt" in caplog.text
+
+
 def test_partial_filter(caplog):
     """Test to ensure when we only load some of the filters, only those filters end up in the dataset"""
     caplog.set_level(logging.WARNING)
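Design note on the test changes: patching HSCDataSet._read_filter_catalog lets FakeFitsFS hand the data set an arbitrary, deliberately corrupted filter_catalog dictionary without writing any FITS or catalog files to disk, while generate_filter_catalog builds a known-good baseline from the fake filesystem for a test to corrupt. The two new tests should be selectable in isolation with pytest's -k option (for example, a name filter matching "prune").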
