fixing visualisation and updating README

Barthelemy-Drabczuk · Barthelemy-Drabczuk · commit c5bb2312dda3 · 2026-02-27T17:25:57.000+01:00
diff --git a/README.md b/README.md
@@ -146,12 +146,15 @@ ls /path/to/data/TESTXX/derivatives/cortical_tiles-2026/crops/2mm
 
 ## 4. Generate Champollion Configuration
 
-Create dataset configuration files for Champollion:
+Create dataset configuration files for Champollion.
+
+> **Recommended:** always pass `--external-config` to keep the `local.yaml` outside the pipeline directory. This is required in read-only containers (Apptainer/Docker) and avoids accidentally committing paths specific to your machine.
 
 ```bash
 pixi run python3 src/generate_champollion_config.py \
     /path/to/data/TESTXX/derivatives/cortical_tiles-2026/crops/2mm \
-    --dataset TESTXX
+    --dataset TESTXX \
+    --external-config /path/to/data/TESTXX/derivatives/champollion_V1/configs/local.yaml
 ```
 
 ### Options
@@ -160,18 +163,7 @@ pixi run python3 src/generate_champollion_config.py \
 |--------|-------------|
 | `--champollion_loc` | Path to Champollion binaries (default: external/champollion_V1) |
 | `--output` | Custom output path for config files |
-| `--external-config` | External path for local.yaml (for read-only containers) |
-
-### Read-only Container Support (Apptainer)
-
-When running in a read-only container environment:
-
-```bash
-pixi run python3 src/generate_champollion_config.py \
-    /path/to/crops \
-    --dataset TESTXX \
-    --external-config /writable/path/local.yaml
-```
+| `--external-config` | Path for `local.yaml` outside the pipeline directory (recommended) |
 
 ## 5. Generate Embeddings
 
@@ -355,18 +347,42 @@ pixi run python3 src/generate_snapshots.py \
 | Option | Description |
 |--------|-------------|
 | `--morphologist_dir` | Path to Morphologist output (for sulcal graph snapshots) |
+| `--subject` | Subject folder name to visualize (e.g. `sub_0001`). When omitted, the first subject found is used. |
+| `--acquisition` | Acquisition folder to use (e.g. `wk30`, `wk40`). Recommended when a subject has multiple segmentations; if omitted, the script warns and uses the first one found. |
 | `--cortical_tiles_dir` | Path to crops/2mm/ directory (for tiles mask snapshots) |
 | `--embeddings_dir` | Path to combined embeddings (for UMAP scatter plots) |
 | `--reference_data_dir` | Path to pre-trained UMAP models and reference coordinates |
+| `--umap_region` | Comma-separated region name(s) to plot (e.g. `FColl-SRh,S.Or.`). Defaults to all regions with available models. |
 | `--output_dir` | Directory to save snapshot images |
 | `--sulcal-only` | Only generate sulcal graph snapshots |
 | `--tiles-only` | Only generate cortical tiles snapshots |
 | `--umap-only` | Only generate UMAP scatter plots |
 | `--width` / `--height` | Snapshot dimensions (default: 800x600) |
 
+### Disambiguating multiple segmentations
+
+If a subject has several Morphologist acquisitions (e.g. two timepoints `wk30` and `wk40`), the script warns and uses the first one found. Specify the acquisition explicitly to avoid ambiguity:
+
+```bash
+pixi run python3 src/generate_snapshots.py \
+    --morphologist_dir /path/to/subjects/ \
+    --subject sub_0001 --acquisition wk40 \
+    --output_dir /path/to/snapshots/ --sulcal-only
+```
+
 ### UMAP Visualization
 
-The UMAP scatter plot projects a new subject's collateral sulcus embedding onto a pre-trained 2D map fitted on 42,433 UKBioBank40 reference subjects. The reference appears as a blue cloud, with the new subject highlighted in red.
+The UMAP scatter plots project a new subject's sulcal region embeddings onto pre-trained 2D maps, one per region and hemisphere. Each plot shows a blue reference cloud (42,433 UKBioBank subjects) with the new subject highlighted in red.
+
+By default, every region that has both an embedding CSV in `--embeddings_dir` and a pre-trained model in `--reference_data_dir` is plotted. Use `--umap_region` to restrict the output:
+
+```bash
+pixi run python3 src/generate_snapshots.py \
+    --embeddings_dir /path/to/embeddings/ \
+    --reference_data_dir reference_data/ \
+    --output_dir /path/to/snapshots/ \
+    --umap-only --umap_region FColl-SRh
+```
 
 Pre-trained UMAP artifacts are stored in `reference_data/` and contain no subject identifiers (only anonymous 2D coordinates and fitted model parameters).
 
diff --git a/src/generate_snapshots.py b/src/generate_snapshots.py
@@ -316,27 +316,73 @@ def generate_tiles_snapshot(crops_dir, output_path, size=(800, 600), level=1,
     return snapshots
 
 
-COLLATERAL_FILES = {
-    "left": "FColl-SRh_left_name06-43-43--210_embeddings.csv",
-    "right": "FColl-SRh_right_name06-56-15--113_embeddings.csv",
-}
+def discover_umap_pairs(embeddings_dir, reference_data_dir, regions=None):
+    """Find (csv_path, model_path, coords_path, region, hemi) tuples.
+
+    Scans ``embeddings_dir`` for ``*_embeddings.csv`` files and matches each
+    one to pre-trained UMAP model artefacts in ``reference_data_dir``.
+
+    CSV filenames are expected to follow the pattern::
+
+        {region}_{hemi}_{identifier}_embeddings.csv
+
+    where ``hemi`` is ``left`` or ``right``.  The corresponding model files
+    must be named::
+
+        umap_{region}_{hemi}.pkl
+        umap_{region}_{hemi}_coords.npy
+
+    Args:
+        embeddings_dir: Directory containing embedding CSV files.
+        reference_data_dir: Directory containing pre-trained UMAP models.
+        regions: Optional list of region names to include.  ``None`` means
+            include all regions for which model artefacts exist.
+
+    Returns:
+        List of tuples ``(csv_path, model_path, coords_path, region, hemi)``.
+    """
+    csv_files = sorted(glob.glob(osp.join(embeddings_dir, "*_embeddings.csv")))
+    pairs = []
+    for csv_path in csv_files:
+        parts = osp.basename(csv_path).split("_")
+        if len(parts) < 3:
+            continue
+        region, hemi = parts[0], parts[1]
+        if hemi not in ("left", "right"):
+            continue
+        if regions and region not in regions:
+            continue
+        model_path = osp.join(reference_data_dir, f"umap_{region}_{hemi}.pkl")
+        coords_path = osp.join(reference_data_dir,
+                               f"umap_{region}_{hemi}_coords.npy")
+        if osp.exists(model_path) and osp.exists(coords_path):
+            pairs.append((csv_path, model_path, coords_path, region, hemi))
+        else:
+            print(f"  UMAP model not found for {region} {hemi} — skipping "
+                  f"(expected {osp.basename(model_path)} in reference_data_dir)")
+    return pairs
 
 
 def generate_umap_snapshot(embeddings_dir, reference_data_dir, output_path,
-                           size=(800, 600)):
-    """Generate UMAP scatter plots for the collateral sulcus region.
+                           size=(800, 600), regions=None):
+    """Generate UMAP scatter plots for embedding regions.
 
-    Projects the pipeline's new subject(s) onto a pre-trained UMAP fitted
-    on UKBioBank40 reference embeddings. Produces one plot per hemisphere.
+    Discovers embedding CSV files in ``embeddings_dir`` and generates a UMAP
+    projection plot for every (region, hemisphere) pair that has both a CSV and
+    pre-trained model artefacts in ``reference_data_dir``.
 
     Args:
-        embeddings_dir: Path to pipeline embeddings (stage 5 output)
-        reference_data_dir: Path to pre-trained UMAP models and coords
-        output_path: Base path for output images (suffixed with _left/_right)
-        size: Tuple of (width, height)
+        embeddings_dir: Path to pipeline embeddings directory.
+        reference_data_dir: Path to pre-trained UMAP models and reference
+            coordinate arrays.
+        output_path: Base path for output images.  Each plot is saved as
+            ``{basename}_{region}_{hemi}{ext}``.
+        size: Tuple of (width, height) in pixels.
+        regions: Optional list of region names to include.  ``None`` means
+            generate plots for all available regions.
 
     Returns:
-        List of generated snapshot file paths
+        List of generated snapshot file paths.
     """
     import joblib
     import matplotlib
@@ -348,35 +394,23 @@ def generate_umap_snapshot(embeddings_dir, reference_data_dir, output_path,
     ext = osp.splitext(output_path)[1] or ".png"
     snapshots = []
 
-    for hemi, csv_name in COLLATERAL_FILES.items():
-        region = csv_name.split("_")[0]
-        model_path = osp.join(
-            reference_data_dir, f"umap_{region}_{hemi}.pkl"
-        )
-        coords_path = osp.join(
-            reference_data_dir, f"umap_{region}_{hemi}_coords.npy"
-        )
-        if not osp.exists(model_path) or not osp.exists(coords_path):
-            print(f"  UMAP artifacts not found for {hemi}, skipping")
-            continue
+    pairs = discover_umap_pairs(embeddings_dir, reference_data_dir,
+                                regions=regions)
+    if not pairs:
+        print("  No matching (embedding CSV, UMAP model) pairs found")
+        return snapshots
 
+    for csv_path, model_path, coords_path, region, hemi in pairs:
         model = joblib.load(model_path)
         ref_coords = np.load(coords_path)
-        print(f"  [{hemi}] Loaded {ref_coords.shape[0]} reference points")
-
-        new_csv = osp.join(embeddings_dir, csv_name)
-        if not osp.exists(new_csv):
-            print(f"  [{hemi}] No embedding found at {new_csv}, skipping")
-            continue
+        print(f"  [{region} {hemi}] Loaded {ref_coords.shape[0]} reference points")
 
-        df = pd.read_csv(new_csv)
+        df = pd.read_csv(csv_path)
         X_new = df.drop(columns=["ID"]).values.astype(np.float32)
         new_coords = model.transform(X_new)
-        print(f"  [{hemi}] Projected {X_new.shape[0]} new subject(s)")
+        print(f"  [{region} {hemi}] Projected {X_new.shape[0]} new subject(s)")
 
-        fig, ax = plt.subplots(
-            figsize=(size[0] / 100, size[1] / 100)
-        )
+        fig, ax = plt.subplots(figsize=(size[0] / 100, size[1] / 100))
         ax.scatter(
             ref_coords[:, 0], ref_coords[:, 1],
             s=1, c="#4a90d9", alpha=0.08,
@@ -387,16 +421,14 @@ def generate_umap_snapshot(embeddings_dir, reference_data_dir, output_path,
             s=80, c="#e74c3c", edgecolors="white", linewidths=0.8,
             zorder=5, label="Your subject",
         )
-        ax.set_title(
-            f"Collateral sulcus \u2014 {hemi}", fontsize=12
-        )
+        ax.set_title(f"{region} \u2014 {hemi}", fontsize=12)
         ax.legend(loc="best", fontsize=9, framealpha=0.9)
         ax.set_xlabel("UMAP 1", fontsize=9)
         ax.set_ylabel("UMAP 2", fontsize=9)
         ax.tick_params(labelsize=8)
         plt.tight_layout()
 
-        snap = f"{basename}_{hemi}{ext}"
+        snap = f"{basename}_{region}_{hemi}{ext}"
         plt.savefig(snap, dpi=150)
         plt.close(fig)
         snapshots.append(snap)
@@ -440,6 +472,10 @@ def __init__(self):
          .add_optional_argument(
              "--reference_data_dir",
              "Path to pre-trained UMAP models and reference coords")
+         .add_optional_argument(
+             "--umap_region",
+             "Comma-separated list of region names to generate UMAP plots for "
+             "(e.g. FColl-SRh,S.Or.). Defaults to all regions with available models.")
          .add_optional_argument(
              "--tiles_level",
              "Region threshold level (0-3)",
@@ -568,23 +604,35 @@ def _run_tiles(self, size):
     def _run_umap(self, size):
         """Generate UMAP scatter plots."""
         snapshots = []
-        if (self.args.embeddings_dir
-                and osp.exists(self.args.embeddings_dir)
-                and self.args.reference_data_dir
-                and osp.exists(self.args.reference_data_dir)):
-            print("\nGenerating UMAP scatter plots...")
-            out = osp.join(self.args.output_dir, "umap_collateral.png")
-            try:
-                snaps = generate_umap_snapshot(
-                    self.args.embeddings_dir,
-                    self.args.reference_data_dir,
-                    out, size,
-                )
-                snapshots.extend(snaps)
-            except Exception as e:
-                print(f"  Error generating UMAP snapshot: {e}")
-        elif self.args.reference_data_dir and not osp.exists(self.args.reference_data_dir):
+
+        if not self.args.embeddings_dir:
+            return snapshots
+        if not osp.exists(self.args.embeddings_dir):
+            print(f"Embeddings directory not found: {self.args.embeddings_dir}")
+            return snapshots
+        if not self.args.reference_data_dir:
+            return snapshots
+        if not osp.exists(self.args.reference_data_dir):
             print(f"Reference data directory not found: {self.args.reference_data_dir}")
+            return snapshots
+
+        regions = None
+        umap_region = getattr(self.args, "umap_region", None)
+        if umap_region:
+            regions = [r.strip() for r in umap_region.split(",")]
+            print(f"\nUMAP region filter: {', '.join(regions)}")
+
+        print("\nGenerating UMAP scatter plots...")
+        out = osp.join(self.args.output_dir, "umap.png")
+        try:
+            snaps = generate_umap_snapshot(
+                self.args.embeddings_dir,
+                self.args.reference_data_dir,
+                out, size, regions=regions,
+            )
+            snapshots.extend(snaps)
+        except Exception as e:
+            print(f"  Error generating UMAP snapshot: {e}")
         return snapshots