Skip to content

Commit ae52b7e

Browse files
committed
Add a check for malformed manifest files
- When manifest or filter_catalog files contain filenames that are obviously for the wrong object ID, we will drop the relevant object and emit a warning - Testing system extended to allow tests for malformed manifests to be written. - Small fixup to .setup_dev.sh script to make sure that its promise to "not output logs" unless there is an error is actually true.
1 parent 99b9646 commit ae52b7e

File tree

3 files changed

+123
-5
lines changed

3 files changed

+123
-5
lines changed

.setup_dev.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ python -m pip install -e . > /dev/null
3434
echo "Installing developer dependencies in local environment"
3535
echo "This might take a few minutes. Only errors will be printed to stdout"
3636
python -m pip install -e .'[dev]' > /dev/null
37-
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
37+
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi
3838

3939
echo "Installing pre-commit"
4040
pre-commit install > /dev/null

src/fibad/data_sets/hsc_data_set.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,15 +413,38 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
413413
logger.info(f"Filters for object {object_id} were {filters_unsorted}")
414414
logger.debug(f"Reference filters were {filters_ref}")
415415

416-
# Drop objects that can't meet the coutout size provided
417416
elif cutout_shape is not None:
417+
# Drop objects that can't meet the cutout size provided
418418
for shape in self.dims[object_id]:
419419
if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
420420
msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
421421
msg += " this is too small for the given cutout size of "
422422
msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
423423
self._mark_for_prune(object_id, msg)
424424
break
425+
426+
# Drop objects where the cutouts are not the same size
427+
first_shape = None
428+
for shape in self.dims[object_id]:
429+
first_shape = shape if first_shape is None else first_shape
430+
if shape != first_shape:
431+
msg = f"The first filter for object {object_id} has a shape of "
432+
msg += f"({first_shape[0]}px,{first_shape[1]}px) another filter has shape of"
433+
msg += f"({shape[0]}px,{shape[1]}px)"
434+
self._mark_for_prune(object_id, msg)
435+
break
436+
437+
# Drop objects where parsing the filenames does not reveal the object IDs
438+
for filter, filepath in filters_unsorted.items():
439+
filename = Path(filepath).name
440+
# Check beginning of filename vs object_id
441+
if filename[:17] != object_id:
442+
msg = f"Filter {filter} for object id {object_id} has filename {filepath} listed"
443+
msg += "The filename does not match the object_id, and the filter_catalog or "
444+
msg += "manifest is likely corrupt."
445+
self._mark_for_prune(object_id, msg)
446+
break
447+
425448
if index != 0 and index % 1_000_000 == 0:
426449
logger.info(f"Processed {index} objects for pruning")
427450
else:

tests/fibad/test_hsc_dataset.py

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import logging
22
import unittest.mock as mock
33
from pathlib import Path
4+
from typing import Any, Optional
45

56
import numpy as np
67
import pytest
7-
from fibad.data_sets.hsc_data_set import HSCDataSet
88
from torchvision.transforms.v2 import CenterCrop, Lambda
99

10+
from fibad.data_sets.hsc_data_set import HSCDataSet
11+
1012
test_dir = Path(__file__).parent / "test_data" / "dataloader"
1113

1214
HSCDataSet._called_from_test = True
@@ -27,8 +29,8 @@ class FakeFitsFS:
2729
more filesystem operations without a really good reason.
2830
"""
2931

30-
def __init__(self, test_files: dict):
31-
self.patchers: list[mock._patch[mock.Mock]] = []
32+
def __init__(self, test_files: dict, filter_catalog: Optional[dict] = None):
33+
self.patchers: list[mock._patch[Any]] = []
3234

3335
self.test_files = test_files
3436

@@ -39,6 +41,17 @@ def __init__(self, test_files: dict):
3941
mock_fits_open = mock.Mock(side_effect=self._open_file)
4042
self.patchers.append(mock.patch("astropy.io.fits.open", mock_fits_open))
4143

44+
if filter_catalog is not None:
45+
mock_read_filter_catalog = mock.patch(
46+
"fibad.data_sets.hsc_data_set.HSCDataSet._read_filter_catalog",
47+
lambda x, y: "Not a real table",
48+
)
49+
self.patchers.append(mock_read_filter_catalog)
50+
mock_parse_filter_catalog = mock.patch(
51+
"fibad.data_sets.hsc_data_set.HSCDataSet._parse_filter_catalog", lambda x, y: filter_catalog
52+
)
53+
self.patchers.append(mock_parse_filter_catalog)
54+
4255
def _open_file(self, filename: Path, **kwargs) -> mock.Mock:
4356
shape = self.test_files[filename.name]
4457
mock_open_ctx = mock.Mock()
@@ -117,6 +130,42 @@ def generate_files(
117130
return test_files
118131

119132

133+
def generate_filter_catalog(test_files: dict, config: dict) -> dict:
134+
"""Generates a filter catalog dict for use with FakeFitsFS from a filesystem dictionary
135+
created by generate_files.
136+
137+
This allows tests to alter the parsed filter_catalog, and interrogate what decisions HSCDataSet makes
138+
when a manifest or filter_catalog file contains corrupt information.
139+
140+
Caveats:
141+
Always run this prior to creating any mocks, or using a FakeFitsFS
142+
143+
This function calls the HSCDataSet parsing code to actually assemble the files array, which means:
144+
1) This test setup is fairly tightly coupled to the internals of HSCDataSet
145+
2) When writing tests using this, ensure any functionality you depend on from the slow file-scan
146+
initialization codepath is tested independently.
147+
148+
Parameters
149+
----------
150+
test_files : dict
151+
Test files dictionary created with generate_files.
152+
config : dict
153+
The config you will use to initialize the test HSCDataSet object in your test.
154+
Create this with mkconfig()
155+
156+
Returns
157+
-------
158+
dict
159+
Dictionary from ObjectID -> (Dictionary from Filter -> Filename)
160+
"""
161+
# Ensure the filter_catalog codepath won't be triggered
162+
config["data_set"]["filter_catalog"] = False
163+
164+
# Use our initialization code to create a parsed files object
165+
with FakeFitsFS(test_files):
166+
return HSCDataSet(config).files
167+
168+
120169
def test_load(caplog):
121170
"""Test to ensure loading a perfectly regular set of files works"""
122171
caplog.set_level(logging.WARNING)
@@ -313,6 +362,52 @@ def test_prune_size(caplog):
313362
assert "too small" in caplog.text
314363

315364

365+
def test_prune_filter_size_mismatch(caplog):
366+
"""Test to ensure images with different sizes per filter will be dropped"""
367+
caplog.set_level(logging.WARNING)
368+
test_files = {}
369+
test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
370+
test_files["00000000000000000_all_filters_HSC-R.fits"] = (99, 99)
371+
372+
with FakeFitsFS(test_files):
373+
a = HSCDataSet(mkconfig(crop_to=(99, 99)))
374+
375+
assert len(a) == 9
376+
assert a.shape() == (5, 99, 99)
377+
378+
# We should warn that we are dropping objects and the reason
379+
assert "Dropping object" in caplog.text
380+
assert "first filter" in caplog.text
381+
382+
383+
def test_prune_bad_filename(caplog):
384+
"""Test to ensure images with filenames set wrong will be dropped"""
385+
caplog.set_level(logging.WARNING)
386+
test_files = {}
387+
test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
388+
389+
config = mkconfig(crop_to=(99, 99), filter_catalog="notarealfile.fits")
390+
391+
# Create a filter catalog with wrong file information.
392+
filter_catalog = generate_filter_catalog(test_files, config)
393+
filters = list(filter_catalog["00000000000000000"].keys())
394+
filter_catalog["00000000000000000"][filters[0]] = filter_catalog["00000000000000001"][filters[0]]
395+
396+
with FakeFitsFS(test_files, filter_catalog):
397+
# Initialize HSCDataset exercising the filter_catalog provided initialization pathway
398+
a = HSCDataSet(config)
399+
400+
# Verify that the broken object has been dropped
401+
assert len(a) == 9
402+
403+
# Verify the shape is correct.
404+
assert a.shape() == (5, 99, 99)
405+
406+
# We should warn that we are dropping objects and the correct reason
407+
assert "Dropping object" in caplog.text
408+
assert "manifest is likely corrupt" in caplog.text
409+
410+
316411
def test_partial_filter(caplog):
317412
"""Test to ensure when we only load some of the filters, only those filters end up in the dataset"""
318413
caplog.set_level(logging.WARNING)

0 commit comments

Comments
 (0)