Commit f17c349

option to calc surface dice per component
1 parent ad03103 commit f17c349

File tree

4 files changed, +120 -46 lines

4 files changed

+120
-46
lines changed
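As orientation for the change, a hypothetical sketch (not part of the commit) of driving the new per-component evaluation from Python, using the collect_results and save_results functions introduced in scripts/cooper/revision/surface_dice.py below; the folder paths and output filename are placeholders:

# Hypothetical driver; assumes it runs next to scripts/cooper/revision/surface_dice.py.
from surface_dice import collect_results, save_results

results = collect_results(
    input_folder="/path/to/predictions",  # placeholder: .h5 files with predictions/az/seg_v<version>
    gt_folder="/path/to/ground_truth",    # placeholder: .h5 files with /labels/az_merged and raw
    version="7",
    check=False,        # True opens napari to inspect the skeletons
    global_eval=False,  # False: one result row per GT connected component
)
save_results(results, "v7_surface_dice_per_gt_component.xlsx")  # placeholder filename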

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -15,4 +15,5 @@ scripts/cooper/training/find_rec_testset.py
 synapse-net-models/
 scripts/portal/upscale_tomo.py
 analysis_results/
-scripts/cooper/revision/evaluation_results/
+scripts/cooper/revision/evaluation_results/
+scripts/cooper/revision/export_tif_to_h5.py

run_sbatch_revision.sbatch

Lines changed: 8 additions & 4 deletions
@@ -1,12 +1,16 @@
 #! /bin/bash
 #SBATCH -c 4 #4 #8
-#SBATCH --mem 120G #120G #32G #64G #256G
+#SBATCH --mem 256G #120G #32G #64G #256G
 #SBATCH -p grete:shared #grete:shared #grete-h100:shared
-#SBATCH -t 4:00:00 #6:00:00 #48:00:00
+#SBATCH -t 6:00:00 #6:00:00 #48:00:00
 #SBATCH -G A100:1 #V100:1 #2 #A100:1 #gtx1080:2 #v100:1 #H100:1
 #SBATCH --output=/user/muth9/u12095/synapse-net/slurm_revision/slurm-%j.out
-#SBATCH -A nim00007 #SBATCH --constraint 80gb
+#SBATCH -A nim00007
+#SBATCH --constraint 80gb
 
 source ~/.bashrc
 conda activate synapse-net
-python scripts/cooper/revision/surface_dice.py -i /mnt/ceph-hdd/cold/nim00007/AZ_prediction_new/endbulb_of_held/ -gt /mnt/ceph-hdd/cold/nim00007/new_AZ_train_data/endbulb_of_held/ -v 7
+python /user/muth9/u12095/synapse-net/scripts/cooper/revision/updated_data_analysis/run_data_analysis.py \
+    -i /mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/20241102_TOMO_DATA_Imig2014/exported/SNAP25/ \
+    -o /mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/20241102_TOMO_DATA_Imig2014/afterRevision_analysis/boundaryT0_9_constantins_presynapticFiltering --store \
+    -s ./analysis_results/man_subset
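Note that the old line packed two directives together (#SBATCH -A nim00007 #SBATCH --constraint 80gb); since sbatch treats everything from a trailing # onward as a comment, the constraint was presumably never applied, and splitting it onto its own #SBATCH line makes it take effect.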

scripts/cooper/revision/surface_dice.py

Lines changed: 102 additions & 33 deletions
@@ -11,20 +11,20 @@
 import pandas as pd
 from tqdm import tqdm
 import numpy as np
+from scipy.ndimage import label
+from skimage.measure import regionprops
 
 from membrain_seg.segmentation.skeletonize import skeletonization
 from membrain_seg.benchmark.metrics import masked_surface_dice
 
 
 def load_segmentation(file_path, key):
-    """Load a dataset from an HDF5 file."""
     with h5py.File(file_path, "r") as f:
         data = f[key][:]
     return data
 
 
 def evaluate_surface_dice(pred, gt, raw, check):
-    """Skeletonize predictions and GT, compute surface dice."""
     gt_skeleton = skeletonization(gt == 1, batch_size=100000)
     pred_skeleton = skeletonization(pred, batch_size=100000)
     mask = gt != 2
@@ -33,10 +33,10 @@ def evaluate_surface_dice(pred, gt, raw, check):
         import napari
         v = napari.Viewer()
         v.add_image(raw)
-        v.add_labels(gt, name= f"gt")
-        v.add_labels(gt_skeleton.astype(np.uint16), name= f"gt_skeleton")
-        v.add_labels(pred, name= f"pred")
-        v.add_labels(pred_skeleton.astype(np.uint16), name= f"pred_skeleton")
+        v.add_labels(gt, name="gt")
+        v.add_labels(gt_skeleton.astype(np.uint16), name="gt_skeleton")
+        v.add_labels(pred, name="pred")
+        v.add_labels(pred_skeleton.astype(np.uint16), name="pred_skeleton")
         napari.run()
 
     surf_dice, confusion_dict = masked_surface_dice(
@@ -45,28 +45,80 @@ def evaluate_surface_dice(pred, gt, raw, check):
     return surf_dice, confusion_dict
 
 
-def process_file(pred_path, gt_path, seg_key, gt_key, check):
-    """Process a single prediction/GT file pair."""
+def process_file(pred_path, gt_path, seg_key, gt_key, check,
+                 min_bb_shape=(32, 384, 384), min_thinning_size=2500,
+                 global_eval=False):
     try:
         pred = load_segmentation(pred_path, seg_key)
         gt = load_segmentation(gt_path, gt_key)
         raw = load_segmentation(gt_path, "raw")
-        surf_dice, confusion = evaluate_surface_dice(pred, gt, raw, check)
 
-        result = {
-            "tomo_name": os.path.basename(pred_path),
-            "surface_dice": surf_dice,
-            **confusion,
-        }
-        return result
+        if global_eval:
+            gt_bin = (gt == 1).astype(np.uint8)
+            pred_bin = pred.astype(np.uint8)
+
+            dice, confusion = evaluate_surface_dice(pred_bin, gt_bin, raw, check)
+            return [{
+                "tomo_name": os.path.basename(pred_path),
+                "gt_component_id": -1,  # -1 indicates global eval
+                "surface_dice": dice,
+                **confusion
+            }]
+
+        labeled_gt, _ = label(gt == 1)
+        props = regionprops(labeled_gt)
+        results = []
+
+        for prop in props:
+            if prop.area < min_thinning_size:
+                continue
+
+            comp_id = prop.label
+            bbox_start = prop.bbox[:3]
+            bbox_end = prop.bbox[3:]
+            bbox = tuple(slice(start, stop) for start, stop in zip(bbox_start, bbox_end))
+
+            pad_width = [
+                max(min_shape - (sl.stop - sl.start), 0) // 2
+                for sl, min_shape in zip(bbox, min_bb_shape)
+            ]
+
+            expanded_bbox = tuple(
+                slice(
+                    max(sl.start - pw, 0),
+                    min(sl.stop + pw, dim)
+                )
+                for sl, pw, dim in zip(bbox, pad_width, gt.shape)
+            )
+
+            gt_crop = (labeled_gt[expanded_bbox] == comp_id).astype(np.uint8)
+            pred_crop = pred[expanded_bbox].astype(np.uint8)
+            raw_crop = raw[expanded_bbox]
+
+            try:
+                dice, confusion = evaluate_surface_dice(pred_crop, gt_crop, raw_crop, check)
+            except Exception as e:
+                print(f"Error computing Dice for GT component {comp_id} in {pred_path}: {e}")
+                continue
+
+            result = {
+                "tomo_name": os.path.basename(pred_path),
+                "gt_component_id": comp_id,
+                "surface_dice": dice,
+                **confusion
+            }
+            results.append(result)
+
+        return results
 
     except Exception as e:
         print(f"Error processing {pred_path}: {e}")
-        return None
+        return []
 
 
-def collect_results(input_folder, gt_folder, version, check=False):
-    """Loop through prediction files and compute metrics."""
+def collect_results(input_folder, gt_folder, version, check=False,
+                    min_bb_shape=(32, 384, 384), min_thinning_size=2500,
+                    global_eval=False):
     results = []
     seg_key = f"predictions/az/seg_v{version}"
     gt_key = "/labels/az_merged"
@@ -83,29 +135,32 @@ def collect_results(input_folder, gt_folder, version, check=False):
             print(f"Warning: Ground truth file not found for {fname}")
             continue
 
-        result = process_file(pred_path, gt_path, seg_key, gt_key, check)
-        if result:
-            result["input_folder"] = input_folder_name
-            results.append(result)
+        file_results = process_file(
+            pred_path, gt_path, seg_key, gt_key, check,
+            min_bb_shape=min_bb_shape,
+            min_thinning_size=min_thinning_size,
+            global_eval=global_eval
+        )
+
+        for res in file_results:
+            res["input_folder"] = input_folder_name
+            results.append(res)
 
     return results
 
 
 def save_results(results, output_file):
-    """Append results to an Excel file, updating rows with matching tomo_name and input_folder."""
     new_df = pd.DataFrame(results)
 
     if os.path.exists(output_file):
         existing_df = pd.read_excel(output_file)
 
-        # Drop rows where tomo_name and input_folder match any in new_df
         combined_df = existing_df[
-            ~existing_df.set_index(["tomo_name", "input_folder"]).index.isin(
-                new_df.set_index(["tomo_name", "input_folder"]).index
+            ~existing_df.set_index(["tomo_name", "input_folder", "gt_component_id"]).index.isin(
+                new_df.set_index(["tomo_name", "input_folder", "gt_component_id"]).index
             )
         ]
 
-        # Append new data and reset index
         final_df = pd.concat([combined_df, new_df], ignore_index=True)
     else:
         final_df = new_df
@@ -114,20 +169,34 @@ def save_results(results, output_file)
     print(f"Results saved to {output_file}")
 
 
-
 def main():
-    parser = argparse.ArgumentParser(description="Compute surface dice for AZ segmentations.")
+    parser = argparse.ArgumentParser(description="Compute surface dice per GT component or globally for AZ segmentations.")
     parser.add_argument("--input_folder", "-i", required=True, help="Folder with predicted segmentations (.h5)")
     parser.add_argument("--gt_folder", "-gt", required=True, help="Folder with ground truth segmentations (.h5)")
     parser.add_argument("--version", "-v", required=True, help="Version string used in prediction key")
-    parser.add_argument("--check", action="store_true", help="Version string used in prediction key")
+    parser.add_argument("--check", action="store_true", help="Visualize intermediate outputs in Napari")
+    parser.add_argument("--global_eval", action="store_true", help="If set, compute global surface dice instead of per-component")
 
     args = parser.parse_args()
 
-    output_file = f"/user/muth9/u12095/synapse-net/scripts/cooper/revision/evaluation_results/v{args.version}_surface_dice.xlsx"
-    results = collect_results(args.input_folder, args.gt_folder, args.version, args.check)
+    min_bb_shape = (32, 384, 384)
+    min_thinning_size = 2500
+
+    suffix = "global" if args.global_eval else "per_gt_component"
+    output_file = f"/user/muth9/u12095/synapse-net/scripts/cooper/revision/evaluation_results/v{args.version}_surface_dice_{suffix}.xlsx"
+
+    results = collect_results(
+        args.input_folder,
+        args.gt_folder,
+        args.version,
+        args.check,
+        min_bb_shape=min_bb_shape,
+        min_thinning_size=min_thinning_size,
+        global_eval=args.global_eval
+    )
+
     save_results(results, output_file)
 
 
 if __name__ == "__main__":
-    main()
+    main()
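The core of the new per-component mode is a label-and-crop pattern: connected components of the ground truth are labeled, components below min_thinning_size are skipped, and each remaining component's bounding box is padded symmetrically toward min_bb_shape (clipped at the volume border) before the surface dice is computed on the crop. A self-contained sketch of just that pattern on a toy volume, with the metric call left out:

import numpy as np
from scipy.ndimage import label
from skimage.measure import regionprops

# Toy 3D "ground truth" with two blobs; the real script uses the AZ label volume.
gt = np.zeros((16, 64, 64), dtype=np.uint8)
gt[4:8, 10:20, 10:20] = 1
gt[8:12, 40:60, 40:60] = 1

min_bb_shape = (8, 32, 32)  # plays the role of the script's (32, 384, 384)
min_size = 100              # plays the role of min_thinning_size=2500

labeled, _ = label(gt == 1)
for prop in regionprops(labeled):
    if prop.area < min_size:
        continue
    bbox = tuple(slice(a, b) for a, b in zip(prop.bbox[:3], prop.bbox[3:]))
    # Symmetric padding toward the minimum crop shape, clipped to the volume.
    pad = [max(m - (sl.stop - sl.start), 0) // 2 for sl, m in zip(bbox, min_bb_shape)]
    crop = tuple(
        slice(max(sl.start - p, 0), min(sl.stop + p, dim))
        for sl, p, dim in zip(bbox, pad, gt.shape)
    )
    comp_mask = (labeled[crop] == prop.label).astype(np.uint8)
    print(prop.label, comp_mask.shape, int(comp_mask.sum()))  # would feed evaluate_surface_dice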

scripts/cooper/revision/updated_data_analysis/store_results.py

Lines changed: 8 additions & 8 deletions
@@ -73,20 +73,20 @@ def save_filtered_dataframes(output_dir, tomogram_name, df):
         'AZ_distances_within_200': 200,
         'AZ_distances_within_100': 100,
         'AZ_distances_within_40': 40,
-        'AZ_distances_within_40_with_diameters': 40,
-        'AZ_distances_within_40_only_diameters': 40,
+        'AZ_distances_within_100_with_diameters': 100,
+        'AZ_distances_within_100_only_diameters': 100,
     }
 
     for filename, max_dist in thresholds.items():
         file_path = os.path.join(output_dir, f"{filename}.xlsx")
         filtered_df = df if max_dist is None else df[df['distance'] <= max_dist]
 
-        if filename == 'AZ_distances_within_40_with_diameters':
+        if filename == 'AZ_distances_within_100_with_diameters':
            data = pd.DataFrame({
                f"{tomogram_name}_distance": filtered_df['distance'].values,
                f"{tomogram_name}_diameter": filtered_df['diameter'].values
            })
-        elif filename == 'AZ_distances_within_40_only_diameters':
+        elif filename == 'AZ_distances_within_100_only_diameters':
            data = pd.DataFrame({
                f"{tomogram_name}_diameter": filtered_df['diameter'].values
            })
@@ -110,8 +110,8 @@ def save_filtered_dataframes_with_seg_id(output_dir, tomogram_name, df):
         'AZ_distances_within_200_with_seg_id': 200,
         'AZ_distances_within_100_with_seg_id': 100,
         'AZ_distances_within_40_with_seg_id': 40,
-        'AZ_distances_within_40_with_diameters_and_seg_id': 40,
-        'AZ_distances_within_40_only_diameters_and_seg_id': 40,
+        'AZ_distances_within_100_with_diameters_and_seg_id': 100,
+        'AZ_distances_within_100_only_diameters_and_seg_id': 100,
     }
 
     with_segID_dir = os.path.join(output_dir, "with_segID")
@@ -121,13 +121,13 @@ def save_filtered_dataframes_with_seg_id(output_dir, tomogram_name, df):
         file_path = os.path.join(with_segID_dir, f"{filename}.xlsx")
         filtered_df = df if max_dist is None else df[df['distance'] <= max_dist]
 
-        if filename == 'AZ_distances_within_40_with_diameters_and_seg_id':
+        if filename == 'AZ_distances_within_100_with_diameters_and_seg_id':
            data = pd.DataFrame({
                f"{tomogram_name}_seg_id": filtered_df['seg_id'].values,
                f"{tomogram_name}_distance": filtered_df['distance'].values,
                f"{tomogram_name}_diameter": filtered_df['diameter'].values
            })
-        elif filename == 'AZ_distances_within_40_only_diameters_and_seg_id':
+        elif filename == 'AZ_distances_within_100_only_diameters_and_seg_id':
            data = pd.DataFrame({
                f"{tomogram_name}_seg_id": filtered_df['seg_id'].values,
                f"{tomogram_name}_diameter": filtered_df['diameter'].values
