Skip to content

Commit ae52b7e

Browse files
committed
Add a check for malformed manifest files
- When manifest or filter_catalog files contain filenames that are obviously for the wrong object ID, we will drop the relevant object and emit a warning - Testing system extended to allow tests for malformed manifests to be written. - Small fixup to .setup_dev.sh script to make sure that its promise to "not output logs" unless there is an error is actually true.
1 parent 99b9646 commit ae52b7e

File tree

3 files changed

+123
-5
lines changed

3 files changed

+123
-5
lines changed

.setup_dev.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ python -m pip install -e . > /dev/null
3434
echo "Installing developer dependencies in local environment"
3535
echo "This might take a few minutes. Only errors will be printed to stdout"
3636
python -m pip install -e .'[dev]' > /dev/null
37-
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
37+
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi
3838

3939
echo "Installing pre-commit"
4040
pre-commit install > /dev/null

src/fibad/data_sets/hsc_data_set.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,15 +413,38 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
413413
logger.info(f"Filters for object {object_id} were {filters_unsorted}")
414414
logger.debug(f"Reference filters were {filters_ref}")
415415

416-
# Drop objects that can't meet the coutout size provided
417416
elif cutout_shape is not None:
417+
# Drop objects that can't meet the cutout size provided
418418
for shape in self.dims[object_id]:
419419
if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
420420
msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
421421
msg += " this is too small for the given cutout size of "
422422
msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
423423
self._mark_for_prune(object_id, msg)
424424
break
425+
426+
# Drop objects where the cutouts are not the same size
427+
first_shape = None
428+
for shape in self.dims[object_id]:
429+
first_shape = shape if first_shape is None else first_shape
430+
if shape != first_shape:
431+
msg = f"The first filter for object {object_id} has a shape of "
432+
msg += f"({first_shape[0]}px,{first_shape[1]}px) another filter has shape of"
433+
msg += f"({shape[0]}px,{shape[1]}px)"
434+
self._mark_for_prune(object_id, msg)
435+
break
436+
437+
# Drop objects where parsing the filenames does not reveal the object IDs
438+
for filter, filepath in filters_unsorted.items():
439+
filename = Path(filepath).name
440+
# Check beginning of filename vs object_id
441+
if filename[:17] != object_id:
442+
msg = f"Filter {filter} for object id {object_id} has filename {filepath} listed"
443+
msg += "The filename does not match the object_id, and the filter_catalog or "
444+
msg += "manifest is likely corrupt."
445+
self._mark_for_prune(object_id, msg)
446+
break
447+
425448
if index != 0 and index % 1_000_000 == 0:
426449
logger.info(f"Processed {index} objects for pruning")
427450
else:

tests/fibad/test_hsc_dataset.py

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import logging
22
import unittest.mock as mock
33
from pathlib import Path
4+
from typing import Any, Optional
45

56
import numpy as np
67
import pytest
7-
from fibad.data_sets.hsc_data_set import HSCDataSet
88
from torchvision.transforms.v2 import CenterCrop, Lambda
99

10+
from fibad.data_sets.hsc_data_set import HSCDataSet
11+
1012
test_dir = Path(__file__).parent / "test_data" / "dataloader"
1113

1214
HSCDataSet._called_from_test = True
@@ -27,8 +29,8 @@ class FakeFitsFS:
2729
more filesystem operations without a really good reason.
2830
"""
2931

30-
def __init__(self, test_files: dict):
31-
self.patchers: list[mock._patch[mock.Mock]] = []
32+
def __init__(self, test_files: dict, filter_catalog: Optional[dict] = None):
33+
self.patchers: list[mock._patch[Any]] = []
3234

3335
self.test_files = test_files
3436

@@ -39,6 +41,17 @@ def __init__(self, test_files: dict):
3941
mock_fits_open = mock.Mock(side_effect=self._open_file)
4042
self.patchers.append(mock.patch("astropy.io.fits.open", mock_fits_open))
4143

44+
if filter_catalog is not None:
45+
mock_read_filter_catalog = mock.patch(
46+
"fibad.data_sets.hsc_data_set.HSCDataSet._read_filter_catalog",
47+
lambda x, y: "Not a real table",
48+
)
49+
self.patchers.append(mock_read_filter_catalog)
50+
mock_parse_filter_catalog = mock.patch(
51+
"fibad.data_sets.hsc_data_set.HSCDataSet._parse_filter_catalog", lambda x, y: filter_catalog
52+
)
53+
self.patchers.append(mock_parse_filter_catalog)
54+
4255
def _open_file(self, filename: Path, **kwargs) -> mock.Mock:
4356
shape = self.test_files[filename.name]
4457
mock_open_ctx = mock.Mock()
@@ -117,6 +130,42 @@ def generate_files(
117130
return test_files
118131

119132

133+
def generate_filter_catalog(test_files: dict, config: dict) -> dict:
134+
"""Generates a filter catalog dict for use with FakeFitsFS from a filesystem dictionary
135+
created by generate_files.
136+
137+
This allows tests to alter the parsed filter_catalog, and interrogate what decisions HSCDataSet makes
138+
when a manifest or filter_catalog file contains corrupt information.
139+
140+
Caveats:
141+
Always run this prior to creating any mocks, or using a FakeFitsFS
142+
143+
This function calls the HSCDataSet parsing code to actually assemble the files array, which means:
144+
1) This test setup is fairly tightly coupled to the internals of HSCDataSet
145+
2) When writing tests using this, ensure any functionality you depend on from the slow file-scan
146+
initialization codepath is tested independently.
147+
148+
Parameters
149+
----------
150+
test_files : dict
151+
Test files dictionary created with generate_files.
152+
config : dict
153+
The config you will use to initialize the test HSCDataSet object in your test.
154+
Create this with mkconfig()
155+
156+
Returns
157+
-------
158+
dict
159+
Dictionary from ObjectID -> (Dictionary from Filter -> Filename)
160+
"""
161+
# Ensure the filter_catalog codepath won't be triggered
162+
config["data_set"]["filter_catalog"] = False
163+
164+
# Use our initialization code to create a parsed files object
165+
with FakeFitsFS(test_files):
166+
return HSCDataSet(config).files
167+
168+
120169
def test_load(caplog):
121170
"""Test to ensure loading a perfectly regular set of files works"""
122171
caplog.set_level(logging.WARNING)
@@ -313,6 +362,52 @@ def test_prune_size(caplog):
313362
assert "too small" in caplog.text
314363

315364

365+
def test_prune_filter_size_mismatch(caplog):
366+
"""Test to ensure images with different sizes per filter will be dropped"""
367+
caplog.set_level(logging.WARNING)
368+
test_files = {}
369+
test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
370+
test_files["00000000000000000_all_filters_HSC-R.fits"] = (99, 99)
371+
372+
with FakeFitsFS(test_files):
373+
a = HSCDataSet(mkconfig(crop_to=(99, 99)))
374+
375+
assert len(a) == 9
376+
assert a.shape() == (5, 99, 99)
377+
378+
# We should warn that we are dropping objects and the reason
379+
assert "Dropping object" in caplog.text
380+
assert "first filter" in caplog.text
381+
382+
383+
def test_prune_bad_filename(caplog):
384+
"""Test to ensure images with filenames set wrong will be dropped"""
385+
caplog.set_level(logging.WARNING)
386+
test_files = {}
387+
test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
388+
389+
config = mkconfig(crop_to=(99, 99), filter_catalog="notarealfile.fits")
390+
391+
# Create a filter catalog with wrong file information.
392+
filter_catalog = generate_filter_catalog(test_files, config)
393+
filters = list(filter_catalog["00000000000000000"].keys())
394+
filter_catalog["00000000000000000"][filters[0]] = filter_catalog["00000000000000001"][filters[0]]
395+
396+
with FakeFitsFS(test_files, filter_catalog):
397+
# Initialize HSCDataset exercising the filter_catalog provided initialization pathway
398+
a = HSCDataSet(config)
399+
400+
# Verify that the broken object has been dropped
401+
assert len(a) == 9
402+
403+
# Verify the shape is correct.
404+
assert a.shape() == (5, 99, 99)
405+
406+
# We should warn that we are dropping objects and the correct reason
407+
assert "Dropping object" in caplog.text
408+
assert "manifest is likely corrupt" in caplog.text
409+
410+
316411
def test_partial_filter(caplog):
317412
"""Test to ensure when we only load some of the filters, only those filters end up in the dataset"""
318413
caplog.set_level(logging.WARNING)

0 commit comments

Comments
 (0)