11"""
2- Created on Thu Dec 5 14:00 :00 2024
2+ Created on Jan 3 12:30 :00 2025
33
44@author: Anna Grim
55@email: anna.grim@alleninstitute.org
99"""
1010
1111from abc import ABC , abstractmethod
12- from careamics . transforms . n2v_manipulate import N2VManipulate
12+ from aind_exaspim_dataset_utils . s3_util import get_img_prefix
1313from concurrent .futures import (
1414 ProcessPoolExecutor ,
1515 ThreadPoolExecutor ,
2424import torch
2525
2626from aind_exaspim_image_compression .utils import img_util , util
27+ from aind_exaspim_image_compression .utils .img_util import BM4D
2728from aind_exaspim_image_compression .utils .swc_util import Reader
2829
2930
3031# --- Custom Datasets ---
3132class TrainDataset (Dataset ):
33+
3234 def __init__ (
3335 self ,
3436 patch_shape ,
35- transform ,
3637 anisotropy = (0.748 , 0.748 , 1.0 ),
3738 boundary_buffer = 4000 ,
3839 foreground_sampling_rate = 0.5 ,
@@ -43,15 +44,19 @@ def __init__(
4344 # Class attributes
4445 self .anisotropy = anisotropy
4546 self .boundary_buffer = boundary_buffer
47+ self .denoise_bm4d = BM4D ()
4648 self .foreground_sampling_rate = foreground_sampling_rate
4749 self .patch_shape = patch_shape
4850 self .swc_reader = Reader ()
49- self .transform = transform
51+
52+ # Ground truth denoising
53+
5054
5155 # Data structures
5256 self .foreground = dict ()
5357 self .imgs = dict ()
5458
59+ # --- Ingest data ---
def ingest_img(self, brain_id, img_path, swc_pointer):
    """
    Registers one whole-brain image with the dataset.

    Reads the image at "img_path" via img_util.read and stores it under
    "brain_id", after extracting the brain's foreground voxels from the
    given SWC pointer.

    Parameters
    ----------
    brain_id : hashable
        Identifier used as the key into self.imgs and self.foreground.
    img_path : str
        Path passed to img_util.read; presumably a cloud/zarr prefix —
        confirm against img_util.
    swc_pointer : object or None
        Forwarded to self.ingest_swcs to build the foreground voxel set.

    Returns
    -------
    None
    """
    foreground_voxels = self.ingest_swcs(swc_pointer)
    self.foreground[brain_id] = foreground_voxels
    self.imgs[brain_id] = img_util.read(img_path)
@@ -73,25 +78,11 @@ def ingest_swcs(self, swc_pointer):
7378 return foreground
7479 return set ()
7580
76- def __len__ (self ):
77- """
78- Counts the number of whole-brain images in the dataset.
79-
80- Parameters
81- ----------
82- None
83-
84- Returns
85- -------
86- int
87- Number of whole-brain images in the dataset.
88- """
89- return len (self .imgs )
90-
81+ # --- Core Routines ---
def __getitem__(self, dummy_input):
    """
    Generates one training example by random sampling.

    The index argument is ignored ("dummy_input"); instead a brain is
    sampled, then a voxel within it, and the extracted patch is passed
    through the BM4D denoiser.

    Parameters
    ----------
    dummy_input : Any
        Unused; present only to satisfy the Dataset indexing protocol.

    Returns
    -------
    object
        Output of self.denoise_bm4d on the sampled patch — NOTE(review):
        in ValidateDataset the same callable yields a (noise, denoised,
        mn_mx) tuple; confirm the expected structure here.
    """
    sampled_brain = self.sample_brain()
    sampled_voxel = self.sample_voxel(sampled_brain)
    patch = self.get_patch(sampled_brain, sampled_voxel)
    return self.denoise_bm4d(patch)
9586
def sample_brain(self):
    """
    Picks one brain ID from the ingested images.

    Returns
    -------
    hashable
        A key of self.imgs chosen by util.sample_once — presumably a
        uniform draw; confirm against util.sample_once.
    """
    candidate_ids = self.imgs.keys()
    return util.sample_once(candidate_ids)
@@ -110,6 +101,21 @@ def sample_voxel(self, brain_id):
110101 return tuple (voxel )
111102
112103 # --- Helpers ---
def __len__(self):
    """
    Counts the whole-brain images currently ingested.

    Returns
    -------
    int
        Number of entries in self.imgs, i.e. the number of whole-brain
        images in the dataset.
    """
    n_brains = len(self.imgs)
    return n_brains
118+
def get_patch(self, brain_id, voxel):
    """
    Extracts an image patch anchored at the given voxel.

    Start/end corners are computed by img_util.get_start_end from the
    voxel and self.patch_shape. The leading [0, 0] index assumes the
    stored image is 5D (e.g. a TCZYX-style zarr array) — TODO confirm.

    Parameters
    ----------
    brain_id : hashable
        Key into self.imgs identifying the whole-brain image.
    voxel : tuple
        3D voxel coordinate anchoring the patch.

    Returns
    -------
    array-like
        Patch of shape self.patch_shape sliced from the image.
    """
    start, end = img_util.get_start_end(voxel, self.patch_shape)
    img = self.imgs[brain_id]
    return img[0, 0, start[0]:end[0], start[1]:end[1], start[2]:end[2]]
@@ -124,13 +130,14 @@ def update_foreground_sampling_rate(self, foreground_sampling_rate):
124130
125131
126132class ValidateDataset (Dataset ):
127- def __init__ (self , patch_shape , transform ):
133+
134+ def __init__ (self , patch_shape ):
128135 # Call parent class
129136 super (ValidateDataset , self ).__init__ ()
130137
131138 # Instance attributes
132139 self .patch_shape = patch_shape
133- self .transform = transform
140+ self .denoise_bm4d = BM4D ()
134141
135142 # Data structures
136143 self .ids = list ()
@@ -159,7 +166,7 @@ def ingest_img(self, brain_id, img_path):
159166
160167 def ingest_example (self , brain_id , voxel ):
161168 # Get clean image
162- noise , denoised , mn_mx = self .transform (
169+ noise , denoised , mn_mx = self .denoise_bm4d (
163170 self .get_patch (brain_id , voxel )
164171 )
165172
@@ -203,6 +210,7 @@ def __init__(self, dataset, batch_size=16):
203210 -------
204211 None
205212 """
213+ # Instance attributes
206214 self .dataset = dataset
207215 self .batch_size = batch_size
208216 self .patch_shape = dataset .patch_shape
@@ -232,43 +240,7 @@ def _load_batch(self, idx):
232240 pass
233241
234242
235- class TrainN2VDataLoader (DataLoader ):
236- """
237- DataLoader that uses multithreading to fetch image patches from the cloud
238- to form batches to train Noise2Void (N2V).
239- """
240-
241- def __init__ (self , dataset , batch_size = 16 , n_upds = 100 ):
242- # Call parent class
243- super ().__init__ (dataset , batch_size )
244-
245- # Instance attributes
246- self .n_upds = n_upds
247-
248- def _get_iterator (self ):
249- return range (self .n_upds )
250-
251- def _load_batch (self , dummy_input ):
252- with ThreadPoolExecutor () as executor :
253- # Assign threads
254- threads = list ()
255- for _ in range (self .batch_size ):
256- threads .append (executor .submit (self .dataset .__getitem__ , - 1 ))
257-
258- # Process results
259- shape = (self .batch_size , 1 ,) + self .patch_shape
260- masked_patches = np .zeros (shape )
261- patches = np .zeros (shape )
262- masks = np .zeros (shape )
263- for i , thread in enumerate (as_completed (threads )):
264- masked_patch , patch , mask = thread .result ()
265- masked_patches [i , 0 , ...] = masked_patch
266- patches [i , 0 , ...] = patch
267- masks [i , 0 , ...] = mask
268- return to_tensor (masked_patches ), to_tensor (patches ), to_tensor (masks )
269-
270-
271- class TrainBM4DDataLoader (DataLoader ):
243+ class TrainDataLoader (DataLoader ):
272244 """
273245 DataLoader that uses multithreading to fetch image patches from the cloud
274246 to form batches.
@@ -282,8 +254,11 @@ def __init__(self, dataset, batch_size=8, n_upds=20):
282254 ----------
283255 dataset : Dataset.ProposalDataset
284256 Instance of custom dataset.
285- batch_size : int
286- Number of samples per batch.
257+ batch_size : int, optional
258+ Number of samples per batch. Default is 8.
259+ n_upds : int, optional
260+ Number of back propagation gradient updates before validating the
261+ model. Default is 20.
287262
288263 Returns
289264 -------
@@ -316,45 +291,7 @@ def _load_batch(self, dummy_input):
316291 return to_tensor (noise_patches ), to_tensor (clean_patches ), None
317292
318293
319- class ValidateN2VDataLoader (DataLoader ):
320- """
321- DataLoader that uses multithreading to fetch image patches from the cloud
322- to form batches.
323- """
324-
325- def __init__ (self , dataset , batch_size = 8 ):
326- super ().__init__ (dataset , batch_size )
327-
328- def _get_iterator (self ):
329- return range (0 , len (self .dataset ), self .batch_size )
330-
331- def _load_batch (self , start_idx ):
332- # Compute batch size
333- n_remaining_examples = len (self .dataset ) - start_idx
334- batch_size = min (self .batch_size , n_remaining_examples )
335-
336- # Generate batch
337- with ThreadPoolExecutor () as executor :
338- # Assign threads
339- threads = list ()
340- for idx_shift in range (batch_size ):
341- idx = start_idx + idx_shift
342- threads .append (executor .submit (self .dataset .__getitem__ , idx ))
343-
344- # Process results
345- shape = (batch_size , 1 ,) + self .patch_shape
346- masked_patches = np .zeros (shape )
347- patches = np .zeros (shape )
348- masks = np .zeros (shape )
349- for i , thread in enumerate (as_completed (threads )):
350- masked_patch , patch , mask = thread .result ()
351- masked_patches [i , 0 , ...] = masked_patch
352- patches [i , 0 , ...] = patch
353- masks [i , 0 , ...] = mask
354- return to_tensor (masked_patches ), to_tensor (patches ), to_tensor (masks )
355-
356-
357- class ValidateBM4DDataLoader (DataLoader ):
294+ class ValidateDataLoader (DataLoader ):
358295 """
359296 DataLoader that uses multiprocessing to fetch image patches from the cloud
360297 to form batches.
@@ -399,30 +336,30 @@ def init_datasets(
399336 brain_ids ,
400337 img_paths_json ,
401338 patch_shape ,
402- n_validate_examples ,
403339 foreground_sampling_rate = 0.5 ,
404- method = "bm4d" ,
340+ n_validate_examples = 0 ,
405341 swc_dict = None
406342):
407343 # Initializations
408- transform = N2VManipulate () if method == "n2v" else img_util .BM4D ()
409344 train_dataset = TrainDataset (
410- patch_shape ,
411- transform ,
412- foreground_sampling_rate = foreground_sampling_rate ,
345+ patch_shape , foreground_sampling_rate = foreground_sampling_rate ,
413346 )
414- val_dataset = ValidateDataset (patch_shape , transform )
347+ val_dataset = ValidateDataset (patch_shape )
415348
416349 # Load data
417350 for brain_id in tqdm (brain_ids , desc = "Load Data" ):
418- img_path = img_util .get_img_prefix (brain_id , img_paths_json )
351+ # Set image path
352+ img_path = get_img_prefix (brain_id , img_paths_json )
419353 img_path += str (0 )
354+
355+ # Set SWC path
420356 if swc_dict :
421357 swc_pointer = deepcopy (swc_dict )
422358 swc_pointer ["path" ] += f"/{ brain_id } /world"
423359 else :
424360 swc_pointer = None
425361
362+ # Ingest data
426363 train_dataset .ingest_img (brain_id , img_path , swc_pointer )
427364 val_dataset .ingest_img (brain_id , img_path )
428365
@@ -436,7 +373,7 @@ def init_datasets(
436373
437374def to_tensor (arr ):
438375 """
439- Converts a numpy array to a torch tensor.
376+ Converts the given numpy array to a torch tensor.
440377
441378 Parameters
442379 ----------
0 commit comments