Commit 82a7300

Author: Alexander Hillsley
bugfix [data_loader]: fix normalization in the dataloader
Previous: normalized by the mean of all pixels across both channels, which produced a tiny range of values for the phase channel and a very unusual distribution for the fluorescent channel.
Fix: for the phase channel, do nothing; values are already zero-centered with a std of roughly 0.25. For the fluorescent channel, take the log of the image to squash the long tail, then zero-mean and standardize.
Parent: 1a81328
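In short, normalization is now per channel rather than global. A condensed sketch of the new _normalize_data logic (the helper name here is illustrative; the actual diff is below), assuming a numpy array of shape (C, H, W) and a matching list of channel names:

import numpy as np

def normalize_per_channel(channel_names, data):
    # Phase2D is already zero-centered with a std of roughly 0.25, so it
    # passes through untouched; fluorescent channels get log-squashed,
    # then zero-meaned and standardized.
    out = []
    for i, ch in enumerate(channel_names):
        img = data[i]
        if ch == "Phase2D":
            out.append(img)
        else:
            log_img = np.log1p(img)  # log(1 + x) squashes the long intensity tail
            out.append((log_img - log_img.mean()) / log_img.std())
    return np.stack(out, axis=0)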

File tree: 2 files changed, +113 -62 lines


src/ops_model/data/data_loader.py (20 additions, 45 deletions)
@@ -211,46 +211,20 @@ def __init__(
         )
         return

-    def _normalize_data(self, ci, channel_names, data, masks):
-
-        # Temporary Fix
-        # normalize per crop and squash all values between -1 and 1
-        data_shift = data - np.mean(data)
-        lo, hi = np.percentile(data_shift, [1, 99.5])
-        scale = max(abs(lo), abs(hi))  # symmetric mapping
-        data_norm = np.clip(data_shift, -scale, scale) / scale
-        if self.cell_masks:
-            data_norm = data_norm * masks
-
-        # fov_attrs = self.stores[ci.store_key][
-        #     ci.tile_pheno
-        # ].zattrs.asdict()  # can create dict for all tiles at beginning
-
-        # # TODO: need a real measure of dataset background
-        # bg = [np.percentile(data, 1)]
-
-        # iqrs = [
-        #     fov_attrs["normalization"][i]["fov_statistics"]["iqr"]
-        #     for i in channel_names
-        # ]
-        # means = [
-        #     fov_attrs["normalization"][i]["fov_statistics"]["mean"]
-        #     for i in channel_names
-        # ]
-
-        # data_bg_sub = np.clip(data - np.expand_dims(bg, (1, 2)), a_min=0, a_max=None)
-
-        # if self.cell_masks:
-        #     data_bg_sub = data_bg_sub * masks
-
-        # data_iqr = (data_bg_sub - np.expand_dims(means, (1, 2))) / (
-        #     np.expand_dims(iqrs, (1, 2)) + 1e-6
-        # )
-
-        # # TODO: Need to fix to work with multiple channels
-        # lo, hi = np.percentile(data_iqr, [1, 99.5])
-        # scale = max(abs(lo), abs(hi))  # symmetric mapping
-        # data_norm = np.clip(data_iqr, -scale, scale) / scale
+    def _normalize_data(self, channel_names, data):
+        img_list = []
+        for ch in channel_names:
+            print(ch)
+            if ch == "Phase2D":
+                img_list.append(data[0])
+            else:
+                # apply log normalization
+                img = data[channel_names.index(ch)]
+                log_img = np.log1p(img)
+                img_norm = (log_img - log_img.mean()) / log_img.std()
+                img_list.append(img_norm)
+
+        data_norm = np.stack(img_list, axis=0)

         return data_norm

@@ -282,9 +256,7 @@ def __getitem__(self, index):
         gene_label = self.label_int_lut[ci.gene_name]
         total_index = ci.total_index

-        channel_names, channel_index = self._get_channels(
-            ci, well
-        )  # probably doesn't have to be done per dataset
+        channel_names, channel_index = self._get_channels(ci, well)

         data = np.asarray(
             fov[0, channel_index, 0, slice(bbox[0], bbox[2]), slice(bbox[1], bbox[3])]
@@ -295,7 +267,10 @@ def __getitem__(self, index):
         ).copy()
         sc_mask = mask == ci.segmentation_id

-        data_norm = self._normalize_data(ci, channel_names, data, sc_mask)
+        data_norm = self._normalize_data(channel_names, data)
+
+        if self.cell_masks:
+            data_norm = data_norm * sc_mask

         batch = {
             "data": data_norm.astype(np.float32),
@@ -455,7 +430,7 @@ def construct_dataloaders(
         self,
         num_workers: int = 1,
         shuffle: bool = True,
-        dataset_type: Literal["basic", "triplet"] = "basic",
+        dataset_type: Literal["basic", "triplet", "cell_profile"] = "basic",
         triplet_kwargs: dict = None,
         basic_kwargs: dict = None,
         cp_kwargs: dict = None,
tests/test_dataloader.py (93 additions, 17 deletions)
@@ -4,7 +4,7 @@


 @pytest.fixture(scope="module")
-def data_manager():
+def feature_data_manager():
     """Create data manager for testing (reused across all tests in module)."""
     experiment_dict = {"ops0033_20250429": ["A/1/0", "A/2/0", "A/3/0"]}
     dm = data_loader.OpsDataManager(
@@ -20,13 +20,36 @@ def data_manager():


 @pytest.fixture(scope="module")
-def batch(data_manager):
+def basic_data_manager():
+    """Create data manager for testing (reused across all tests in module)."""
+    experiment_dict = {"ops0033_20250429": ["A/1/0", "A/2/0", "A/3/0"]}
+    dm = data_loader.OpsDataManager(
+        experiments=experiment_dict,
+        batch_size=2,
+        data_split=(1, 0, 0),
+        out_channels=["Phase2D", "mCherry"],
+        initial_yx_patch_size=(256, 256),
+        verbose=False,
+    )
+    dm.construct_dataloaders(num_workers=1, dataset_type="basic")
+    return dm
+
+
+@pytest.fixture(scope="module")
+def feature_batch(feature_data_manager):
     """Get a single batch for testing (reused across all tests in module)."""
-    train_loader = data_manager.train_loader
+    train_loader = feature_data_manager.train_loader
     return next(iter(train_loader))


-def test_batch_keys_cellprofiler(batch):
+@pytest.fixture(scope="module")
+def basic_batch(basic_data_manager):
+    """Get a single batch for testing (reused across all tests in module)."""
+    train_loader = basic_data_manager.train_loader
+    return next(iter(train_loader))
+
+
+def test_batch_keys_cellprofiler(feature_batch):
     expected_keys = [
         "data",
         "cell_mask",
@@ -51,31 +74,84 @@ def test_batch_keys_cellprofiler(batch):
         "crop_info": list,
     }

-    batch_keys = list(batch.keys())
+    batch_keys = list(feature_batch.keys())
     for k, v in expected_keys.items():
         assert k in batch_keys

-        assert isinstance(batch[k], v)
+        assert isinstance(feature_batch[k], v)
+    return
+
+
+# Test that the data returned is normalized
+def test_data_normalization(basic_batch):
+    data = basic_batch["data"]
+    # compute mean over all but batch and channel dimensions
+    mean = torch.mean(data, dim=(0, 2, 3))
+
+    # assert that mean is approximately 0
+    assert torch.allclose(mean, torch.zeros_like(mean), atol=1e-1)
+
+    return
+
+
+# test that requesting different out channels works
+def test_out_channels(basic_data_manager, basic_batch):
+
+    shape = basic_batch["data"].shape
+    assert shape[1] == 2  # 2 out channels requested
+
+    basic_data_manager.out_channels = ["Phase2D"]
+    basic_data_manager.construct_dataloaders(num_workers=1, dataset_type="basic")
+    batch = next(iter(basic_data_manager.train_loader))
+    shape_1 = batch["data"].shape
+    assert shape_1[1] == 1  # 1 out channel requested
+
+    basic_data_manager.out_channels = ["mCherry"]
+    basic_data_manager.construct_dataloaders(num_workers=1, dataset_type="basic")
+    batch = next(iter(basic_data_manager.train_loader))
+    shape_2 = batch["data"].shape
+    assert shape_2[1] == 1  # 1 out channel requested

     return


-# def test_batch_keys_basic(batch):
+# Test that turning masking on/off works
+def test_cell_masking(feature_data_manager, feature_batch):
+
+    data = feature_batch["data"]
+    cell_mask = feature_batch["cell_mask"]
+    # assert that where cell_mask is 0, data is also 0
+    masked_data = data * (cell_mask == 0)
+    assert torch.sum(masked_data) == 0

-#     return
+    feature_data_manager.train_loader.dataset.use_cell_mask = False
+    batch = next(iter(feature_data_manager.train_loader))
+    data_no_mask = batch["data"]
+    # assert that data_no_mask is not equal to data everywhere
+    assert not torch.equal(data, data_no_mask)

+    return

-# def test_data_loader_consistancy(data_manager):
-#     dm, batch = data_manager

-#     new_data_manager, _ = create_data_manager()
+# Test that different patch sizes work
+def test_patch_size(basic_data_manager, basic_batch):

-#     batch_labels = batch["gene_label"].detach().cpu().numpy()
-#     total_indxs = batch["total_index"].detach().cpu().numpy()
+    shape = basic_batch["data"].shape
+    assert shape[2] == 128  # initial patch size
+    assert shape[3] == 128

-#     gene_names = dm.labels_df.iloc[total_indxs].gene_name.to_list()
-#     mapped_labels = np.asarray([new_data_manager.label_int_lut[a] for a in gene_names])
+    basic_data_manager.final_yx_patch_size = (256, 256)
+    basic_data_manager.construct_dataloaders(num_workers=1, dataset_type="basic")
+    batch = next(iter(basic_data_manager.train_loader))
+    shape_1 = batch["data"].shape
+    assert shape_1[2] == 256  # changed patch size
+    assert shape_1[3] == 256

-#     assert np.all(batch_labels == mapped_labels)
+    basic_data_manager.final_yx_patch_size = (64, 64)
+    basic_data_manager.construct_dataloaders(num_workers=1, dataset_type="basic")
+    batch = next(iter(basic_data_manager.train_loader))
+    shape_2 = batch["data"].shape
+    assert shape_2[2] == 64  # changed patch size again
+    assert shape_2[3] == 64

-#     return
+    return
