changes to semisupervised_training.py domain_adaptation.py

stmartineau99 · stmartineau99 · commit e3a46c94d022 · 2025-10-30T15:54:17.000+01:00
diff --git a/scripts/cryo/actin/predict_actin.py b/scripts/cryo/actin/predict_actin.py
@@ -1,54 +1,82 @@
 import os
 from glob import glob
 from pathlib import Path
+from typing import Optional
 
 import h5py
 import numpy as np
 from elf.io import open_file
+from synapse_net.training.supervised_training import get_3d_model
 from synapse_net.inference.actin import segment_actin
+import torch_em
+import torch
 
+def predict_actin(input_dir, model_path, output_dir, device: int=0, torch_load: bool=False, state_key: Optional[str]=None):
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
 
-# Run prediction on the actin val volume.
-def predict_actin_val():
-    path = "/mnt/lustre-grete/usr/u12086/data/deepict/deepict_actin/00012.h5"
+    model_path = Path(model_path)
+    model_name = model_path.stem 
 
-    # This is the validation ROI.
-    roi = np.s_[250:, :, :]
-    with h5py.File(path, "r") as f:
-        raw = f["raw"][roi]
+    if torch_load:
+        ckpt = str(model_path / "best.pt")
+        x = torch.load(ckpt, map_location=f"cuda:{device}", weights_only=False)
+        model = get_3d_model(out_channels=2)
+        if state_key is None:
+            state_key = "model_state"
+        model.load_state_dict(x[state_key])
+    else:
+        model = torch_em.util.load_model(str(model_path), device=f"cuda:{device}")
 
-    model_path = "./checkpoints/actin-deepict"
-    seg, pred = segment_actin(raw, model_path, verbose=True, return_predictions=True)
+    for data_path in input_dir.glob("*.h5"):
+        with h5py.File(data_path, "r") as f:
+            raw = f["raw"][:]
+            labels = f["labels/actin"][:]
 
-    with h5py.File("actin_pred.h5", "a") as f:
-        f.create_dataset("raw", data=raw, compression="gzip")
-        f.create_dataset("actin_seg", data=seg, compression="gzip")
-        f.create_dataset("actin_pred", data=pred, compression="gzip")
+        seg, pred = segment_actin(raw, model=model, verbose=True, return_predictions=True)
 
+        output_path = output_dir / f"{data_path.stem}.h5"
 
-def predict_actin_fb():
-    root = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/fernandez-busnadiego/from_arsen/tomos_actin_18924"  # noqa
-    files = glob(os.path.join(root, "*.mrc"))
+        print(f"Writing prediction to {output_path}.")
+        with h5py.File(output_path, "a") as f:
+            if "raw" not in f:
+                f.create_dataset("raw", data=raw, compression="gzip")
+            if "labels/actin" not in f:
+                f.create_dataset("labels/actin", data=labels, compression="gzip")
+            f.create_dataset(f"predictions/{model_name}", data=pred, compression="gzip")
+            f.create_dataset(f"segmentations/{model_name}", data=seg, compression="gzip")
 
-    model_path = "./checkpoints/actin-adapted-v1"
-
-    for ff in files:
-        print("Predict", ff)
-        with open_file(ff, "r") as f:
-            raw = f["data"][:]
-        seg, pred = segment_actin(raw, model_path, verbose=True, return_predictions=True)
-
-        out_path = f"{Path(ff).stem}.h5"
-        with h5py.File(out_path, "a") as f:
-            # f.create_dataset("raw", data=raw, compression="gzip")
-            f.create_dataset("actin_seg", data=seg, compression="gzip")
-            f.create_dataset("actin_pred", data=pred, compression="gzip")
+def main():
+    MODEL_DIR = Path("/mnt/data1/sage/synapse-net/scripts/cryo/actin/output")
+    PRED_DIR = Path("/mnt/data1/sage/synapse-net/scripts/cryo/actin/predictions")
 
+    predict_actin(
+        input_dir = "/mnt/data1/sage/actin-segmentation/data/deepict/deepict_actin/test",
+        model_path = MODEL_DIR / "experiment2/run3/checkpoints/actin-adapted-opto2deepict-v2",
+        output_dir = PRED_DIR / "deepict",
+        device = 3,
+        torch_load=True,
+        state_key="teacher_state"
+    ) 
 
-def main():
-    # predict_actin_val()
-    predict_actin_fb()
+    predict_actin(
+        input_dir = "/mnt/data1/sage/actin-segmentation/data/deepict/deepict_actin/test",
+        model_path = MODEL_DIR / "experiment1/run1/checkpoints/actin-deepict-v3",
+        output_dir = PRED_DIR / "deepict",
+        device = 3,
+        torch_load=True,
+        state_key="model_state"
+    )
 
+    predict_actin(
+        input_dir = "/mnt/data1/sage/actin-segmentation/data/EMPIAR-12292/h5/test",
+        model_path = MODEL_DIR / "experiment1/run3/checkpoints/actin-adapted-deepict2opto-v2",
+        output_dir = PRED_DIR / "opto",
+        device = 3,
+        torch_load=True,
+        state_key="teacher_state"
+    ) 
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/cryo/actin/surface_dice.py b/scripts/cryo/actin/surface_dice.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+import sys
+import os
+
+# Add membrain-seg to the Python path; set MEMBRAIN_SEG_PATH to override the default checkout.
+MEMBRAIN_SEG_PATH = os.environ.get("MEMBRAIN_SEG_PATH", "/home/sage/membrain-seg/src")
+if MEMBRAIN_SEG_PATH not in sys.path:
+    sys.path.insert(0, MEMBRAIN_SEG_PATH)
+
+import argparse
+import h5py
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from scipy.ndimage import label
+from skimage.measure import regionprops
+
+try:
+    from membrain_seg.segmentation.skeletonize import skeletonization
+    from membrain_seg.benchmark.metrics import masked_surface_dice
+except ImportError as err:
+    # Re-raise with install instructions; the original error stays attached as the cause.
+    raise ImportError("membrain_seg not found in path. Download source code: "
+                      "https://github.com/teamtomo/membrain-seg/tree/main/src/membrain_seg") from err
+
+def load_segmentation(file_path, key):
+    """Read the full dataset stored under `key` in the HDF5 file at `file_path`."""
+    with h5py.File(file_path, "r") as handle:
+        return handle[key][:]
+
+def evaluate_surface_dice(pred, gt, raw, check):
+    """Skeletonize `pred` and `gt == 1` and score them with `masked_surface_dice`.
+
+    Returns its `(surface dice, confusion dict)` pair; `check` opens a napari viewer first.
+    """
+    skeleton_gt = skeletonization(gt == 1, batch_size=100000)
+    skeleton_pred = skeletonization(pred, batch_size=100000)
+    # NOTE(review): both callers in this file pass a 0/1 `gt`, so this mask is all True - confirm.
+    valid = gt != 2
+    if check:
+        import napari
+        viewer = napari.Viewer()
+        viewer.add_image(raw)
+        layers = (("gt", gt), ("gt_skeleton", skeleton_gt.astype(np.uint16)),
+                  ("pred", pred), ("pred_skeleton", skeleton_pred.astype(np.uint16)))
+        for layer_name, layer_data in layers:
+            viewer.add_labels(layer_data, name=layer_name)
+        napari.run()
+    surf_dice, confusion_dict = masked_surface_dice(skeleton_pred, skeleton_gt, pred, gt, valid)
+    return surf_dice, confusion_dict
+
+
+def process_file(pred_path, gt_path, seg_key, gt_key, check,
+                 min_bb_shape=(64, 384, 384), min_thinning_size=2500,
+                 global_eval=False):
+    """Score one predicted segmentation against its ground truth with surface dice.
+
+    Args:
+        pred_path: HDF5 file containing the predicted segmentation.
+        gt_path: HDF5 file containing the ground truth and the "raw" dataset.
+        seg_key: Dataset key of the segmentation in `pred_path`.
+        gt_key: Dataset key of the labels in `gt_path`; label 1 is evaluated.
+        check: Forwarded to `evaluate_surface_dice`.
+        min_bb_shape: Per-axis size towards which small component boxes are grown.
+        min_thinning_size: Components whose `area` is below this are skipped.
+        global_eval: Score the whole volume at once instead of each component.
+
+    Returns:
+        A list of result dicts: a single one with "gt_component_id" -1 for a global
+        evaluation, otherwise one per scored component. Errors are printed, not raised;
+        a failing component is skipped and any other failure returns an empty list.
+    """
+    try:
+        pred = load_segmentation(pred_path, seg_key)
+        gt = load_segmentation(gt_path, gt_key)
+        raw = load_segmentation(gt_path, "raw")
+        foreground = gt == 1
+
+        if global_eval:
+            gt_bin = foreground.astype(np.uint8)
+            pred_bin = pred.astype(np.uint8)
+            dice, confusion = evaluate_surface_dice(pred_bin, gt_bin, raw, check)
+            # A component id of -1 marks the whole-volume evaluation.
+            return [{"tomo_name": os.path.basename(pred_path), "gt_component_id": -1,
+                     "surface_dice": dice, **confusion}]
+
+        components, _ = label(foreground)
+        scored = []
+        for region in regionprops(components):
+            if region.area < min_thinning_size:
+                continue
+            region_id = region.label
+
+            # Grow each axis of the bounding box by half of what it lacks to reach
+            # `min_bb_shape`, on both sides, and clip the result to the volume.
+            lows, highs = region.bbox[:3], region.bbox[3:]
+            window = []
+            for low, high, target, extent in zip(lows, highs, min_bb_shape, gt.shape):
+                margin = max(target - (high - low), 0) // 2
+                window.append(slice(max(low - margin, 0), min(high + margin, extent)))
+            window = tuple(window)
+
+            gt_crop = (components[window] == region_id).astype(np.uint8)
+            pred_crop = pred[window].astype(np.uint8)
+            raw_crop = raw[window]
+            try:
+                dice, confusion = evaluate_surface_dice(pred_crop, gt_crop, raw_crop, check)
+            except Exception as err:
+                print(f"Error computing Dice for GT component {region_id} in {pred_path}: {err}")
+                continue
+
+            scored.append({
+                "tomo_name": os.path.basename(pred_path),
+                "gt_component_id": region_id,
+                "surface_dice": dice,
+                **confusion,
+            })
+        return scored
+    except Exception as err:
+        print(f"Error processing {pred_path}: {err}")
+        return []
+
+
+def collect_results(input_folder, gt_folder, model_name, check=False,
+                    min_bb_shape=(32, 384, 384), min_thinning_size=2500,
+                    global_eval=False):
+    """Run `process_file` on every ".h5" prediction in `input_folder`, in sorted name order.
+
+    Predictions are read from "/segmentations/<model_name>" and compared with "/labels/actin"
+    in the same-named file in `gt_folder`; files without that counterpart are skipped with a
+    warning. The remaining arguments are forwarded to `process_file`. Returns all result
+    dicts, each tagged with the base name of `input_folder` under "input_folder".
+    """
+    results = []
+    seg_key = f"/segmentations/{model_name}"
+    gt_key = "/labels/actin"
+    input_folder_name = os.path.basename(os.path.normpath(input_folder))
+
+    # os.listdir returns entries in arbitrary order; sort so the output rows are reproducible.
+    for fname in tqdm(sorted(os.listdir(input_folder)), desc="Processing segmentations"):
+        if not fname.endswith(".h5"):
+            continue
+
+        pred_path = os.path.join(input_folder, fname)
+        print(pred_path)
+        gt_path = os.path.join(gt_folder, fname)
+        if not os.path.exists(gt_path):
+            print(f"Warning: Ground truth file not found for {fname}")
+            continue
+        file_results = process_file(pred_path, gt_path, seg_key, gt_key, check, min_bb_shape=min_bb_shape,
+                                    min_thinning_size=min_thinning_size, global_eval=global_eval)
+        for res in file_results:
+            res["input_folder"] = input_folder_name
+            results.append(res)
+    return results
+
+
+def save_results(results, output_file):
+    """Merge `results` into the Excel file `output_file` and write it back.
+
+    Stored rows sharing (tomo_name, input_folder, gt_component_id) with a new row are
+    replaced; that lookup is skipped when either side is empty, as it needs the key columns.
+    """
+    key_cols = ["tomo_name", "input_folder", "gt_component_id"]
+    new_df = pd.DataFrame(results)
+    if os.path.exists(output_file):
+        existing_df = pd.read_excel(output_file)
+        if not new_df.empty and not existing_df.empty:
+            stale = existing_df.set_index(key_cols).index.isin(new_df.set_index(key_cols).index)
+            existing_df = existing_df[~stale]
+        final_df = pd.concat([existing_df, new_df], ignore_index=True)
+    else:
+        final_df = new_df
+    final_df.to_excel(output_file, index=False)
+    print(f"Results saved to {output_file}")
+
+
+def main():
+    """Parse the command line, evaluate the requested model and save the scores.
+
+    The sheet is written to ./evaluation_results/<model_name>_surface_dice_<suffix>.xlsx.
+    """
+    cli = argparse.ArgumentParser(description="Compute surface dice per GT component or globally for actin segmentations.")
+    cli.add_argument("--input_folder", "-i", required=True, help="Folder with predicted segmentations (.h5)")
+    cli.add_argument("--gt_folder", "-gt", required=True, help="Folder with ground truth segmentations (.h5)")
+    cli.add_argument("--model_name", "-m", required=True, help="Model name string used in prediction key")
+    cli.add_argument("--check", action="store_true", help="Visualize intermediate outputs in Napari")
+    cli.add_argument("--global_eval", action="store_true", help="If set, compute global surface dice instead of per-component")
+    opts = cli.parse_args()
+
+    if opts.global_eval:
+        suffix = "global"
+    else:
+        suffix = "per_gt_component"
+    output_file = f"./evaluation_results/{opts.model_name}_surface_dice_{suffix}.xlsx"
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+    results = collect_results(
+        opts.input_folder,
+        opts.gt_folder,
+        opts.model_name,
+        opts.check,
+        min_bb_shape=(32, 464, 464),
+        min_thinning_size=2500,
+        global_eval=opts.global_eval,
+    )
+    save_results(results, output_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/synapse_net/training/domain_adaptation.py b/synapse_net/training/domain_adaptation.py
@@ -110,7 +110,9 @@ def _train_epoch_unsupervised(self, progress, forward_context, backprop):
         # Sample from both the supervised and unsupervised loader.
         for xu1, xu2 in self.unsupervised_train_loader:
             
-            # Assuming shape (B, C, D, H, W), only keep the first channel for xu2 (student input).
+            # Keep only the first channel for xu2 (student input).
+            if xu2.ndim != 5:
+                raise ValueError(f"Expect xu2 to have 5 dimensions (B, C, D, H, W), got shape {xu2.shape}.")
             if xu2.shape[1] > 1:
                 xu2 = xu2[:, :1].contiguous()
 
@@ -123,6 +125,8 @@ def _train_epoch_unsupervised(self, progress, forward_context, backprop):
                 pseudo_labels, label_filter = self.pseudo_labeler(self.teacher, teacher_input)
 
             # Drop the second channel for xu1 (teacher input) after computing the pseudo labels.
+            if xu1.ndim != 5:
+                raise ValueError(f"Expect xu1 to have 5 dimensions (B, C, D, H, W), got shape {xu1.shape}.")
             if xu1.shape[1] > 1:
                 xu1 = xu1[:, :1].contiguous()
 
@@ -184,7 +188,7 @@ def mean_teacher_adaptation(
     train_background_mask_paths: Optional[Tuple[str]] = None,
     patch_sampler: Optional[callable] = None,
     pseudo_label_sampler: Optional[callable] = None,
-    device: int = 0,
+    device: Optional[torch.device] = None,
 ) -> None:
     """Run domain adapation to transfer a network trained on a source domain for a supervised
     segmentation task to perform this task on a different target domain.
@@ -197,11 +201,9 @@ def mean_teacher_adaptation(
 
     Args:
         name: The name for the checkpoint to be trained.
-        unsupervsied_train_paths: Filepaths to the hdf5 files or similar file formats
-            for the training data in the target domain.
+        unsupervised_train_paths: Filepaths to the hdf5 or mrc files for the training data in the target domain.
             This training data is used for unsupervised learning, so it does not require labels.
-        unsupervised_val_paths: Filepaths to the hdf5 files or similar file formats
-            for the validation data in the target domain.
+        unsupervised_val_paths: Filepaths to the hdf5 or mrc files for the validation data in the target domain.
             This validation data is used for unsupervised learning, so it does not require labels.
         patch_shape: The patch shape used for a training example.
             In order to run 2d training pass a patch shape with a singleton in the z-axis,
@@ -231,9 +233,9 @@ def mean_teacher_adaptation(
             based on the patch_shape and size of the volumes used for validation.
         train_sample_mask_paths: Filepaths to the sample masks used by the patch sampler to accept or reject 
             patches for training.
-        val_sample_mask_paths: Filepaths to the sample masks used by the patch sampler to accept or reject 
+        val_sample_mask_paths: Filepaths to the sample mask mrc files used by the patch sampler to accept or reject
             patches for validation. 
-        train_background_mask_paths: Filepaths to the background masks used for training.
+        train_background_mask_paths: Filepaths to the background mask mrc files used for training.
             Background masks are used to subtract background from the pseudo labels before the forward pass. 
         patch_sampler: A sampler for rejecting patches based on a defined conditon. 
         pseudo_label_sampler: A sampler for rejecting pseudo-labels based on a defined condition.
diff --git a/synapse_net/training/semisupervised_training.py b/synapse_net/training/semisupervised_training.py