
Commit 06e6740
Initial adaptation to work with S3 data
1 parent f12f9d3

6 files changed: +280 additions, -56 deletions
Lines changed: 73 additions & 9 deletions
@@ -1,16 +1,55 @@
 import numpy as np
 import vigra
+import multiprocessing as mp
+from concurrent import futures
 
 from skimage import measure
 from scipy.spatial import distance
 from scipy.sparse import csr_matrix
+from tqdm import tqdm
 
+import elf.parallel as parallel
+from elf.io import open_file
+import nifty.tools as nt
 
-def filter_isolated_objects(segmentation, distance_threshold=15, neighbor_threshold=5):
-    segmentation, n_ids, _ = vigra.analysis.relabelConsecutive(segmentation, start_label=1, keep_zeros=True)
+def filter_isolated_objects(
+    segmentation, output_path, tsv_table=None,
+    distance_threshold=15, neighbor_threshold=5, min_size=1000,
+    output_key="segmentation_postprocessed",
+):
+    """
+    Postprocessing step to filter isolated objects from a segmentation.
+    Instances are filtered out if they have fewer neighbors than a given threshold within a given distance around them.
+    Additionally, size filtering is possible if a TSV table is supplied.
 
-    props = measure.regionprops(segmentation)
-    coordinates = np.array([prop.centroid for prop in props])
+    :param dataset segmentation: Dataset containing the segmentation
+    :param str output_path: Output path for the postprocessed segmentation
+    :param tsv_table: Optional table, read from a TSV file in MoBIE format, containing segmentation parameters
+    :param int distance_threshold: Distance in micrometers to check for neighbors
+    :param int neighbor_threshold: Minimal number of neighbors for filtering
+    :param int min_size: Minimal number of pixels for filtering small instances
+    :param str output_key: Output key for the postprocessed segmentation
+    """
+    if tsv_table is not None:
+        n_pixels = tsv_table["n_pixels"].to_list()
+        label_ids = tsv_table["label_id"].to_list()
+        centroids = list(zip(tsv_table["anchor_x"], tsv_table["anchor_y"], tsv_table["anchor_z"]))
+        n_ids = len(label_ids)
+
+        # filter out cells smaller than min_size
+        if min_size is not None:
+            min_size_label_ids = [l for (l, n) in zip(label_ids, n_pixels) if n <= min_size]
+            centroids = [c for (c, l) in zip(centroids, label_ids) if l not in min_size_label_ids]
+            label_ids = [int(l) for l in label_ids if l not in min_size_label_ids]
+
+        coordinates = np.array(centroids)
+        label_ids = np.array(label_ids)
+
+    else:
+        segmentation, n_ids, _ = vigra.analysis.relabelConsecutive(segmentation[:], start_label=1, keep_zeros=True)
+        props = measure.regionprops(segmentation)
+        coordinates = np.array([prop.centroid for prop in props])
+        label_ids = np.unique(segmentation)[1:]
 
     # Calculate pairwise distances and convert to a square matrix
     dist_matrix = distance.pdist(coordinates)
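The lines between this hunk and the next, which turn the pairwise distances into a sparse neighbor matrix, are unchanged and therefore not shown in the diff. A toy sketch of that neighbor-counting step; the thresholding into a csr_matrix is assumed from the surrounding code, not quoted from the commit:

```python
import numpy as np
from scipy.spatial import distance
from scipy.sparse import csr_matrix

# Three mutually close centroids and one isolated outlier.
coordinates = np.array([[0, 0, 0], [1, 0, 0], [0, 2, 0], [50, 50, 50]], dtype=float)

# Pairwise distances as a square matrix, as in the hunk above.
dist_matrix = distance.squareform(distance.pdist(coordinates))

# Assumed thresholding step: objects closer than the distance threshold count as neighbors.
adjacency = csr_matrix((dist_matrix < 15) & (dist_matrix > 0))
neighbor_counts = adjacency.sum(axis=1)

filter_mask = np.array(neighbor_counts < 2).squeeze()
print(filter_mask)  # [False False False  True] -> only the outlier would be filtered
```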
@@ -22,13 +61,38 @@ def filter_isolated_objects(segmentation, distance_threshold=15, neighbor_threshold=5):
     # Sum each row to count neighbors
     neighbor_counts = sparse_matrix.sum(axis=1)
 
-    seg_ids = np.unique(segmentation)[1:]
     filter_mask = np.array(neighbor_counts < neighbor_threshold).squeeze()
-    filter_ids = seg_ids[filter_mask]
+    filter_ids = label_ids[filter_mask]
+
+    shape = segmentation.shape
+    block_shape = (128, 128, 128)
+    chunks = (128, 128, 128)
+
+    blocking = nt.blocking([0] * len(shape), shape, block_shape)
+
+    output = open_file(output_path, mode="a")
+
+    output_dataset = output.create_dataset(
+        output_key, shape=shape, dtype=segmentation.dtype,
+        chunks=chunks, compression="gzip"
+    )
+
+    def filter_chunk(block_id):
+        """
+        Set all points within a chunk to zero if they match filter IDs.
+        """
+        block = blocking.getBlock(block_id)
+        volume_index = tuple(slice(beg, end) for beg, end in zip(block.begin, block.end))
+        data = segmentation[volume_index]
+        data[np.isin(data, filter_ids)] = 0
+        output_dataset[volume_index] = data
+
+    # Limit the number of cores for parallelization.
+    n_threads = min(16, mp.cpu_count())
 
-    seg_filtered = segmentation.copy()
-    seg_filtered[np.isin(seg_filtered, filter_ids)] = 0
+    with futures.ThreadPoolExecutor(n_threads) as filter_pool:
+        list(tqdm(filter_pool.map(filter_chunk, range(blocking.numberOfBlocks)), total=blocking.numberOfBlocks))
 
-    seg_filtered, n_ids_filtered, _ = vigra.analysis.relabelConsecutive(seg_filtered, start_label=1, keep_zeros=True)
+    seg_filtered, n_ids_filtered, _ = parallel.relabel_consecutive(output_dataset, start_label=1, keep_zeros=True, block_shape=(128, 128, 128))
 
     return seg_filtered, n_ids, n_ids_filtered
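With the new signature, the filtered result is written block-wise into output_path instead of being held purely in memory. A hypothetical call, assuming filter_isolated_objects is in scope and the table is a MoBIE default table with label_id, n_pixels, and anchor_x/y/z columns; the paths and dataset keys are placeholders:

```python
import pandas as pd
from elf.io import open_file

# Placeholder paths; the segmentation is expected as a chunked dataset (zarr/n5).
segmentation = open_file("segmentation.zarr", mode="r")["segmentation"]
table = pd.read_csv("tables/default.tsv", sep="\t")

seg_filtered, n_ids, n_ids_filtered = filter_isolated_objects(
    segmentation, output_path="segmentation_postprocessed.zarr", tsv_table=table,
    distance_threshold=15, neighbor_threshold=5, min_size=1000,
)
print(f"Kept {n_ids_filtered} of {n_ids} objects.")
```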

flamingo_tools/segmentation/unet_prediction.py

Lines changed: 60 additions & 8 deletions
@@ -1,5 +1,6 @@
 import multiprocessing as mp
 import os
+import sys
 import warnings
 from concurrent import futures
 
@@ -10,6 +11,7 @@
 import vigra
 import torch
 import z5py
+import zarr
 import json
 
 from elf.wrapper import ThresholdWrapper, SimpleTransformationWrapper
@@ -18,6 +20,10 @@
 from torch_em.util import load_model
 from torch_em.util.prediction import predict_with_halo
 from tqdm import tqdm
+from inspect import getsourcefile
+
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(getsourcefile(lambda: 0)))), "scripts", "prediction"))
+import upload_to_s3
 
 """
 Prediction using distance U-Net.
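The new upload_to_s3 import relies on a sys.path manipulation rather than a package import. As a rough orientation for what the path expression resolves to; the repository layout is inferred from the file paths in this commit, not stated in the code:

```python
import os
from inspect import getsourcefile

# In unet_prediction.py, getsourcefile(lambda: 0) returns the path of the current
# source file, e.g. <repo>/flamingo_tools/segmentation/unet_prediction.py.
# Three dirname() calls walk up to <repo>, to which "scripts/prediction"
# (the assumed location of upload_to_s3.py) is appended.
this_file = getsourcefile(lambda: 0)
repo_root = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
print(os.path.join(repo_root, "scripts", "prediction"))
```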
@@ -43,7 +49,7 @@ def ndim(self):
         return self._volume.ndim - 1
 
 
-def prediction_impl(input_path, input_key, output_folder, model_path, scale, block_shape, halo, prediction_instances=1, slurm_task_id=0, mean=None, std=None):
+def prediction_impl(input_path, input_key, output_folder, model_path, scale, block_shape, halo, prediction_instances=1, slurm_task_id=0, mean=None, std=None, s3=None):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         if os.path.isdir(model_path):
@@ -56,6 +62,9 @@ def prediction_impl(input_path, input_key, output_folder, model_path, scale, block_shape, halo, prediction_instances=1, slurm_task_id=0, mean=None, std=None, s3=None):
 
     if input_key is None:
         input_ = imageio.imread(input_path)
+    elif s3 is not None:
+        with zarr.open(input_path, mode="r") as f:
+            input_ = f[input_key]
     else:
         input_ = open_file(input_path, "r")[input_key]
 
@@ -138,7 +147,7 @@ def postprocess(x):
     return original_shape
 
 
-def find_mask(input_path, input_key, output_folder):
+def find_mask(input_path, input_key, output_folder, s3=None):
     mask_path = os.path.join(output_folder, "mask.zarr")
     f = z5py.File(mask_path, "a")
 
@@ -149,6 +158,10 @@ def find_mask(input_path, input_key, output_folder, s3=None):
     if input_key is None:
         raw = imageio.imread(input_path)
         chunks = (64, 64, 64)
+    elif s3 is not None:
+        with zarr.open(input_path, mode="r") as fin:
+            raw = fin[input_key]
+            chunks = raw.chunks
     else:
         fin = open_file(input_path, "r")
         raw = fin[input_key]
@@ -243,7 +256,10 @@ def write_block(block_id):
         tp.map(write_block, range(blocking.numberOfBlocks))
 
 
-def calc_mean_and_std(input_path, input_key, output_folder):
+def calc_mean_and_std(
+    input_path, input_key, output_folder,
+    s3=None,
+):
     """
     Calculate mean and standard deviation of full volume.
     Parameters are saved in 'mean_std.json' within the output folder.
@@ -254,6 +270,9 @@ def calc_mean_and_std(
 
     if input_key is None:
         input_ = imageio.imread(input_path)
+    elif s3 is not None:
+        with zarr.open(input_path, mode="r") as f:
+            input_ = f[input_key]
     else:
         input_ = open_file(input_path, "r")[input_key]
 
@@ -267,6 +286,7 @@ def calc_mean_and_std(
     with open(json_file, "w") as f:
         json.dump(ddict, f)
 
+
 def run_unet_prediction(
     input_path, input_key,
     output_folder, model_path,
@@ -288,32 +308,63 @@ def run_unet_prediction(
 
 def run_unet_prediction_preprocess_slurm(
     input_path, input_key, output_folder,
+    s3=None, s3_bucket_name=None, s3_service_endpoint=None, s3_credentials=None,
 ):
     """
     Pre-processing for the parallel prediction with U-Net models.
     Masks are stored in mask.zarr in the output folder.
     The mean and standard deviation are precomputed for later usage during prediction
-    and stored in a JSON file within the output folder as mean_std.json
+    and stored in a JSON file within the output folder as mean_std.json.
     """
-    find_mask(input_path, input_key, output_folder)
-    calc_mean_and_std(input_path, input_key, output_folder)
+    if s3 is not None:
+        bucket_name, service_endpoint, credentials = upload_to_s3.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
+
+        input_path, fs = upload_to_s3.get_s3_path(input_path, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+
+    if not os.path.isdir(os.path.join(output_folder, "mask.zarr")):
+        find_mask(input_path, input_key, output_folder, s3=s3)
+
+    calc_mean_and_std(input_path, input_key, output_folder, s3=s3)
+
 
 def run_unet_prediction_slurm(
     input_path, input_key, output_folder, model_path,
     scale=None,
     block_shape=None, halo=None, prediction_instances=1,
+    s3=None, s3_bucket_name=None, s3_service_endpoint=None, s3_credentials=None,
 ):
+    """
+    Run prediction of distance U-Net for data stored locally or on an S3 bucket.
+
+    :param str input_path: File path to input data
+    :param str input_key: Input key for data in ome.zarr format
+    :param str output_folder: Output folder for prediction.zarr
+    :param str model_path: File path to distance U-Net model
+    :param float scale:
+    :param tuple block_shape:
+    :param tuple halo:
+    :param int prediction_instances: Number of workers for parallel computation within slurm array
+    :param bool s3: Flag for accessing data on S3 bucket
+    :param str s3_bucket_name: S3 bucket name. Optional if BUCKET_NAME has been exported
+    :param str s3_service_endpoint: S3 service endpoint. Optional if SERVICE_ENDPOINT has been exported
+    :param str s3_credentials: Path to file containing S3 credentials
+    """
     os.makedirs(output_folder, exist_ok=True)
     prediction_instances = int(prediction_instances)
     slurm_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
 
+    if s3 is not None:
+        bucket_name, service_endpoint, credentials = upload_to_s3.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
+
+        input_path, fs = upload_to_s3.get_s3_path(input_path, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+
     if slurm_task_id is not None:
         slurm_task_id = int(slurm_task_id)
     else:
         raise ValueError("The SLURM_ARRAY_TASK_ID is not set. Ensure that you are using the '-a' option with SBATCH.")
 
     if not os.path.isdir(os.path.join(output_folder, "mask.zarr")):
-        find_mask(input_path, input_key, output_folder)
+        find_mask(input_path, input_key, output_folder, s3=s3)
 
     # get pre-computed mean and standard deviation of full volume from JSON file
     if os.path.isfile(os.path.join(output_folder, "mean_std.json")):
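For context, the mean/std read-back that follows this check is not part of the diff. A sketch of what it presumably does; the key names in mean_std.json are an assumption, since the dictionary written by calc_mean_and_std is not shown here:

```python
import json
import os

output_folder = "/scratch/prediction_out"  # placeholder

# Assumed file layout: {"mean": <float>, "std": <float>} written by calc_mean_and_std.
json_file = os.path.join(output_folder, "mean_std.json")
if os.path.isfile(json_file):
    with open(json_file, "r") as f:
        stats = json.load(f)
    mean, std = stats["mean"], stats["std"]
else:
    mean, std = None, None  # prediction_impl would then compute them itself (assumption)
```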
@@ -328,9 +379,10 @@ def run_unet_prediction_slurm(
     original_shape = prediction_impl(
         input_path, input_key, output_folder, model_path, scale, block_shape, halo,
         prediction_instances=prediction_instances, slurm_task_id=slurm_task_id,
-        mean=mean, std=std,
+        mean=mean, std=std, s3=s3,
     )
 
+
 # does NOT need GPU, FIXME: only run on CPU
 def run_unet_segmentation_slurm(output_folder, min_size):
     min_size = int(min_size)
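Taken together, a hedged sketch of how the S3-aware entry points might be driven. All paths, the bucket, and the credentials file are placeholders, and whether the module is importable as flamingo_tools.segmentation.unet_prediction depends on how the package is installed. Per the docstrings above, SLURM_ARRAY_TASK_ID must be set for the prediction step, and BUCKET_NAME/SERVICE_ENDPOINT can be exported instead of passing the corresponding arguments:

```python
from flamingo_tools.segmentation.unet_prediction import (
    run_unet_prediction_preprocess_slurm,
    run_unet_prediction_slurm,
)

# Step 1: compute mask.zarr and mean_std.json once (no GPU needed).
run_unet_prediction_preprocess_slurm(
    "MyCochlea/raw.ome.zarr", "s0", "/scratch/prediction_out",
    s3=True, s3_credentials="/home/user/.s3_credentials",
)

# Step 2: run the actual prediction from within a SLURM array job.
run_unet_prediction_slurm(
    "MyCochlea/raw.ome.zarr", "s0", "/scratch/prediction_out", "/models/distance_unet",
    block_shape=(64, 256, 256), halo=(16, 64, 64), prediction_instances=4,
    s3=True, s3_credentials="/home/user/.s3_credentials",
)
```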

scripts/extract_block.py

Lines changed: 25 additions & 23 deletions
@@ -1,10 +1,14 @@
 import os
+import sys
 import argparse
 import numpy as np
 import z5py
 import zarr
 
-import s3fs
+from inspect import getsourcefile
+
+sys.path.append(os.path.join(os.path.dirname(getsourcefile(lambda: 0)), "prediction"))
+import upload_to_s3
 
 """
 This script extracts data around an input center coordinate in a given ROI halo.
@@ -18,7 +22,10 @@
 """
 
 
-def main(input_file, output_dir, input_key, resolution, coords, roi_halo, s3):
+def main(
+    input_file, output_dir, coords, input_key, resolution, roi_halo,
+    s3, s3_credentials, s3_bucket_name, s3_service_endpoint,
+):
     """
 
     :param str input_file: File path to input folder in n5 format
@@ -28,6 +35,9 @@ def main(input_file, output_dir, input_key, resolution, coords, roi_halo, s3):
     :param str coords: Center coordinates of extracted 3D volume in format 'x,y,z'
     :param str roi_halo: ROI halo of extracted 3D volume in format 'x,y,z'
     :param bool s3: Flag for using an S3 bucket
+    :param str s3_credentials: Path to file containing S3 credentials
+    :param str s3_bucket_name: S3 bucket name. Optional if BUCKET_NAME has been exported
+    :param str s3_service_endpoint: S3 service endpoint. Optional if SERVICE_ENDPOINT has been exported
     """
 
     coords = [int(r) for r in coords.split(",")]
@@ -61,33 +71,18 @@ def main(input_file, output_dir, input_key, resolution, coords, roi_halo, s3):
     roi = tuple(slice(co - rh, co + rh) for co, rh in zip(coords, roi_halo))
 
     if s3:
+        bucket_name, service_endpoint, credentials = upload_to_s3.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
 
-        # Define S3 bucket and OME-Zarr dataset path
-
-        bucket_name = "cochlea-lightsheet"
-        zarr_path = f"{bucket_name}/{input_file}"
-
-        # Create an S3 filesystem
-        fs = s3fs.S3FileSystem(
-            client_kwargs={"endpoint_url": "https://s3.fs.gwdg.de"},
-            anon=False
-        )
+        s3_path, fs = upload_to_s3.get_s3_path(input_file, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
 
-        if not fs.exists(zarr_path):
-            print("Error: Path does not exist!")
-
-        # Open the OME-Zarr dataset
-        store = zarr.storage.FSStore(zarr_path, fs=fs)
-        print(f"Opening file {zarr_path} from the S3 bucket.")
-
-        with zarr.open(store, mode="r") as f:
+        with zarr.open(s3_path, mode="r") as f:
             raw = f[input_key][roi]
 
     else:
-        with z5py.File(input_file, "r") as f:
+        with zarr.open(input_file, mode="r") as f:
             raw = f[input_key][roi]
 
-    with z5py.File(output_file, "w") as f_out:
+    with zarr.open(output_file, mode="w") as f_out:
         f_out.create_dataset("raw", data=raw, compression="gzip")
 
 if __name__ == "__main__":
@@ -103,8 +98,15 @@ def main(input_file, output_dir, input_key, resolution, coords, roi_halo, s3):
     parser.add_argument('-r', "--resolution", type=float, default=0.38, help="Resolution of input in micrometer")
 
     parser.add_argument("--roi_halo", type=str, default="128,128,64", help="ROI halo around center coordinate in format 'x,y,z'")
+
     parser.add_argument("--s3", action="store_true", help="Use S3 bucket")
+    parser.add_argument("--s3_credentials", default=None, help="Input file containing S3 credentials")
+    parser.add_argument("--s3_bucket_name", default=None, help="S3 bucket name")
+    parser.add_argument("--s3_service_endpoint", default=None, help="S3 service endpoint")
 
     args = parser.parse_args()
 
-    main(args.input, args.output, args.input_key, args.resolution, args.coord, args.roi_halo, args.s3)
+    main(
+        args.input, args.output, args.coord, args.input_key, args.resolution, args.roi_halo,
+        args.s3, args.s3_credentials, args.s3_bucket_name, args.s3_service_endpoint,
+    )
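The hard-coded S3 handling removed above corresponds roughly to the following. The bucket name and endpoint are taken from the deleted lines, while the dataset name and key are placeholders; the new upload_to_s3.get_s3_path helper is assumed to wrap the same s3fs/zarr mechanics with configurable bucket, endpoint, and credentials:

```python
import s3fs
import zarr

# What the deleted inline code did: open an OME-Zarr dataset on the S3 bucket.
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://s3.fs.gwdg.de"}, anon=False)
store = zarr.storage.FSStore("cochlea-lightsheet/MyStack.ome.zarr", fs=fs)  # placeholder dataset

with zarr.open(store, mode="r") as f:
    raw = f["s0"][0:128, 0:128, 0:64]  # placeholder key and ROI
print(raw.shape)
```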
