Update subtype analysis

constantinpape · constantinpape · commit 94774a0d059e · 2025-09-17T12:42:08.000+02:00
diff --git a/scripts/measurements/sgn_subtypes.py b/scripts/measurements/sgn_subtypes.py
@@ -2,7 +2,6 @@
 import os
 import sys
 from glob import glob
-from subprocess import run
 
 import matplotlib.pyplot as plt
 import pandas as pd
@@ -31,21 +30,6 @@
     "M_LR_000099_L", "M_LR_000214_L", "M_AMD_N62_L", "M_LR_000184_R", "M_LR_000184_L"
 ]
 
-# Map from channels to subtypes.
-# Comment Aleyna:
-# The signal will be a gradient between different subtypes:
-# For example CR is expressed more, is brigther,
-# in type 1a SGNs but exist in type Ib SGNs and to a lesser extent in type 1c.
-# Same is also true for other markers so we will need to set a threshold for each.
-# Luckily the signal seems less variable compared to GFP.
-CHANNEL_TO_TYPE = {
-    "CR": "Type-Ia",
-    "Calb1": "Type-Ib",
-    "Lypd1": "Type-Ic",
-    "Prph": "Type-II",
-    "Ntng1": "Type-Ib/c",
-}
-
 # For custom thresholds.
 THRESHOLDS = {
     "M_LR_000214_L": {
@@ -55,29 +39,50 @@
 }
 
 # For consistent colors.
-ALL_COLORS = ["red", "blue", "orange", "yellow", "cyan", "magenta", "green", "purple"]
+ALL_COLORS = ["red", "blue", "orange", "yellow", "cyan", "magenta", "green", "purple", "gray", "black"]
 COLORS = {}
 
 PLOT_OUT = "./subtype_plots"
 
-# TODO: updates based on Aleyna's feedback.
-# Subtype mapping
 
-# Combined visualization for the cochleae
-# Can we visualize the tonotopy in subtypes and not stainings?
-# It would also be good to have subtype percentages per cochlea and pooled together as a diagram and tonotopy?
-# This would help to see if different staining gives same/similar results.
 # Type Ia ; CR+ / Calb1- or Calb1- / Lypd1-
 # Type Ib: CR+ / Calb1+ or Calb1+ / Lypd1+
 # Type Ic: CR-/Calb1+ - or Calb1- / Lypd1+
 # Type II: CR-/Calb1- or Calb1- / Lypd1- or Prph+
+def stain_to_type(stain):
+    # Normalize the staining string.
+    stains = stain.replace(" ", "").split("/")
+    assert len(stains) in (1, 2)
 
-# > It's good to see that for the N mice the Ntng1C and Lypd1 separate from CR so well on the thresholds. Can I visualize these samples ones segmentation masks are done to verify the Ntng1C thresholds? As this is a quite clear signal I'm not sure if taking the middle of the histogram would be the best choice.
-# The segmentations are in MoBIE already. I need to send you the tables for analyzing the signals. Will send them later.
+    if len(stains) == 1:
+        stain_norm = stain
+    else:
+        s1, s2 = sorted(stains)
+        stain_norm = f"{s1}/{s2}"
 
-# > Where are we at PV-Prph segmentation results from MLR184_L and R for SGN type II analysis? This would hopefully give <5% Prph+ cells.
-# The cochleae are in MoBIE. Segmentation and Prph signal look good! I will include it in the next analysis.
-# Need tonotopic mapping from Martin and then compute the intensities.
+    stain_to_type = {
+        # Combinations of Calb1 and CR:
+        "CR+/Calb1+": "Type Ib",
+        "CR-/Calb1+": "Type Ib/Ic",  # Calb1 is expressed in Ic less than Lypd1 but more than CR
+        "CR+/Calb1-": "Type Ia",
+        "CR-/Calb1-": "Type II",
+
+        # Combinations of Calb1 and Lypd1:
+        "Calb1+/Lypd1+": "Type Ib/Ic",
+        "Calb1+/Lypd1-": "Type Ib",
+        "Calb1-/Lypd1+": "Type Ic",
+        "Calb1-/Lypd1-": "inconclusive",  # Can be Type Ia or Type II
+
+        # Prph is isolated.
+        "Prph+": "Type II",
+        "Prph-": "Type I",
+    }
+
+    if stain_norm not in stain_to_type:
+        breakpoint()
+        raise ValueError(f"Invalid stain combination: {stain_norm}")
+
+    return stain_to_type[stain_norm], stain_norm
 
 
 def check_processing_status():
@@ -189,8 +194,9 @@ def require_missing_tables(missing_tables):
             )
 
             # S3 upload
-            run(["rclone", "--progress", "copyto", output_folder,
-                 f"cochlea-lightsheet:cochlea-lightsheet/{cochlea}/tables/{seg_name}"])
+            # from subprocess import run
+            # run(["rclone", "--progress", "copyto", output_folder,
+            #      f"cochlea-lightsheet:cochlea-lightsheet/{cochlea}/tables/{seg_name}"])
 
 
 def compile_data_for_subtype_analysis():
@@ -209,14 +215,20 @@ def compile_data_for_subtype_analysis():
             assert "CR" in channels
             reference_channel = "CR"
             seg_name = "CR_SGN_v2"
-        reference_channel, seg_name
 
         content = s3.open(f"{BUCKET_NAME}/{cochlea}/dataset.json", mode="r", encoding="utf-8")
         info = json.loads(content.read())
         sources = info["sources"]
 
         # Load the segmentation table.
-        seg_source = sources[seg_name]
+        try:
+            seg_source = sources[seg_name]
+        except KeyError as e:
+            if seg_name == "PV_SGN_v2":
+                seg_source = sources["SGN_v2"]
+                seg_name = "SGN_v2"
+            else:
+                raise e
         table_folder = os.path.join(
             BUCKET_NAME, cochlea, seg_source["segmentation"]["tableData"]["tsv"]["relativePath"]
         )
@@ -232,12 +244,19 @@ def compile_data_for_subtype_analysis():
         # Analyze the different channels (= different subtypes).
         reference_intensity = None
         for channel in channels:
-            # Load the intensity table.
-            intensity_path = os.path.join(table_folder, f"{channel}_{seg_name.replace('_', '-')}_object-measures.tsv")
-            table_content = s3.open(intensity_path, mode="rb")
+            # Load the intensity table, prefer local.
+            table_name = f"{channel}_{seg_name.replace('_', '-')}_object-measures.tsv"
+            intensity_path = os.path.join("object_measurements", cochlea, table_name)
+
+            if os.path.exists(intensity_path):
+                intensities = pd.read_csv(intensity_path, sep="\t")
+            else:
+                intensity_path = os.path.join(table_folder, table_name)
+                table_content = s3.open(intensity_path, mode="rb")
+
+                intensities = pd.read_csv(table_content, sep="\t")
+                intensities = intensities[intensities.label_id.isin(valid_sgns)]
 
-            intensities = pd.read_csv(table_content, sep="\t")
-            intensities = intensities[intensities.label_id.isin(valid_sgns)]
             assert len(table) == len(intensities)
             assert (intensities.label_id.values == table.label_id.values).all()
 
@@ -258,11 +277,20 @@ def _plot_histogram(table, column, name, show_plots, class_names=None, apply_thr
     data = table[column].values
     threshold = threshold_otsu(data)
 
+    if class_names is not None:
+        assert len(class_names) == 2
+        c0, c1 = class_names
+        subtype_classification = [c0 if datum < threshold else c1 for datum in data]
+
     fig, ax = plt.subplots(1)
     ax.hist(data, bins=24)
     if apply_threshold:
         ax.axvline(x=threshold, color='red', linestyle='--')
-        ax.set_title(f"{name}\n threshold: {threshold}")
+        if class_names is None:
+            ax.set_title(f"{name}\n threshold: {threshold}")
+        else:
+            pos_perc = len([st for st in subtype_classification if st == c1]) / float(len(subtype_classification))
+            ax.set_title(f"{name}\n threshold: {threshold}\n %{c1}: {pos_perc * 100}")
     else:
         ax.set_title(name)
 
@@ -271,11 +299,9 @@ def _plot_histogram(table, column, name, show_plots, class_names=None, apply_thr
     else:
         os.makedirs(PLOT_OUT, exist_ok=True)
         plt.savefig(f"{PLOT_OUT}/{name}.png")
+    plt.close()
 
     if class_names is not None:
-        assert len(class_names) == 2
-        c0, c1 = class_names
-        subtype_classification = [c0 if datum < threshold else c1 for datum in data]
         return subtype_classification
 
 
@@ -310,6 +336,7 @@ def _plot_2d(ratios, name, show_plots, classification=None, colors=None):
     else:
         os.makedirs(PLOT_OUT, exist_ok=True)
         plt.savefig(f"{PLOT_OUT}/{name}.png")
+    plt.close()
 
 
 def _plot_tonotopic_mapping(freq, classification, name, colors, show_plots):
@@ -324,6 +351,11 @@ def _plot_tonotopic_mapping(freq, classification, name, colors, show_plots):
     fig, ax = plt.subplots(figsize=(8, 4))
     for cat, vals in frequency_mapped.items():
         ax.scatter(x_positions, vals.value, label=cat, color=colors[cat])
+
+    main_ticks = range(len(bin_labels))
+    ax.set_xticks(main_ticks)
+    ax.set_xticklabels(bin_labels)
+    ax.set_xlabel("Octave band (kHz)")
     ax.legend()
     ax.set_title(name)
 
@@ -334,12 +366,103 @@ def _plot_tonotopic_mapping(freq, classification, name, colors, show_plots):
         plt.savefig(f"{PLOT_OUT}/{name}.png")
     plt.close()
 
+    return frequency_mapped
+
+
+# Combined visualization for the cochleae
+# Can we visualize the tonotopy in subtypes and not stainings?
+# It would also be good to have subtype percentages, per cochlea and pooled together, as a diagram and as tonotopy.
+# This would help to see if different stainings give the same/similar results.
+def combined_analysis(results, show_plots):
+    #
+    # Create the tonotopic mapping.
+    #
+    summary = {}
+    for cochlea, result in results.items():
+        if cochlea == "M_LR_000214_L":  # One of the signals cannot be analyzed.
+            continue
+        mapping = result["tonotopic_mapping"]
+        summary[cochlea] = mapping
+
+    colors = {}
+
+    fig, axes = plt.subplots(len(summary), sharey=True, figsize=(8, 8))
+    for i, (cochlea, frequency_mapped) in enumerate(summary.items()):
+        ax = axes[i]
+
+        result = next(iter(frequency_mapped.values()))
+        bin_labels = pd.unique(result["octave_band"])
+        band_to_x = {band: i for i, band in enumerate(bin_labels)}
+        x_positions = result["octave_band"].map(band_to_x)
+
+        for cat, vals in frequency_mapped.items():
+            values = vals.value
+            cat = cat[:cat.find(" (")]
+            if cat not in colors:
+                current_colors = list(colors.values())
+                next_color = ALL_COLORS[len(current_colors)]
+                colors[cat] = next_color
+            ax.scatter(x_positions, values, label=cat, color=colors[cat])
+
+        main_ticks = range(len(bin_labels))
+        ax.set_xticks(main_ticks)
+        ax.set_xticklabels(bin_labels)
+        ax.set_title(cochlea)
+        ax.legend()
+
+    ax.set_xlabel("Octave band (kHz)")
+    plt.tight_layout()
+    if show_plots:
+        plt.show()
+    else:
+        plt.savefig("./subtype_plots/overview_tonotopic_mapping.png")
+        plt.close()
+
+    #
+    # Create the overview figure.
+    #
+    summary, types = {}, []
+    for cochlea, result in results.items():
+        if cochlea == "M_LR_000214_L":  # One of the signals cannot be analyzed.
+            continue
+
+        classification = result["classification"]
+        classification = [cls[:cls.find(" (")] for cls in classification]
+        n_tot = len(classification)
+
+        this_types = list(set(classification))
+        types.extend(this_types)
+        summary[cochlea] = {}
+        for stype in types:
+            n_type = len([cls for cls in classification if cls == stype])
+            type_ratio = float(n_type) / n_tot
+            summary[cochlea][stype] = type_ratio
+
+    types = list(set(types))
+    df = pd.DataFrame(summary).fillna(0)  # missing values → 0
+
+    # Transpose → cochleae on x-axis, subtypes stacked
+    ax = df.T.plot(kind="bar", stacked=True, figsize=(8, 5))
+
+    ax.set_ylabel("Fraction")
+    ax.set_xlabel("Cochlea")
+    ax.set_title("Subtype Fractions per Cochlea")
+    plt.xticks(rotation=0)
+    plt.tight_layout()
+
+    if show_plots:
+        plt.show()
+    else:
+        plt.savefig("./subtype_plots/overview.png")
+        plt.close()
+
 
 def analyze_subtype_data_regular(show_plots=True):
     global PLOT_OUT, COLORS  # noqa
     PLOT_OUT = "subtype_plots/regular_mice"
 
     files = sorted(glob("./subtype_analysis/*.tsv"))
+    results = {}
 
     for ff in files:
         cochlea = os.path.basename(ff)[:-len("_subtype_analysis.tsv")]
@@ -354,10 +477,10 @@ def analyze_subtype_data_regular(show_plots=True):
         tab = pd.read_csv(ff, sep="\t")
 
         # 1.) Plot simple intensity histograms, including otsu threshold.
-        # for chan in channels:
-        #     column = f"{chan}_median"
-        #     name = f"{cochlea}_{chan}_histogram"
-        #     _plot_histogram(tab, column, name, show_plots, apply_threshold=chan != reference_channel)
+        for chan in channels:
+            column = f"{chan}_median"
+            name = f"{cochlea}_{chan}_histogram"
+            _plot_histogram(tab, column, name, show_plots, apply_threshold=chan != reference_channel)
 
         # 2.) Plot ratio histograms, including otsu threshold.
         ratios = {}
@@ -372,9 +495,18 @@ def analyze_subtype_data_regular(show_plots=True):
             ratios[f"{chan}_{reference_channel}"] = tab[column].values
 
         # Unify the classification and assign colors
-        cls1, cls2 = classification[0], classification[1]
-        assert len(cls1) == len(cls2)
-        classification = [f"{c1} / {c2}" for c1, c2 in zip(cls1, cls2)]
+        assert len(classification) in (1, 2)
+        if len(classification) == 2:
+            cls1, cls2 = classification[0], classification[1]
+            assert len(cls1) == len(cls2)
+            classification = [f"{c1} / {c2}" for c1, c2 in zip(cls1, cls2)]
+            show_2d = True
+        else:
+            classification = classification[0]
+            show_2d = False
+
+        classification = [stain_to_type(cls) for cls in classification]
+        classification = [f"{stype} ({stain})" for stype, stain in classification]
 
         unique_labels = set(classification)
         for label in unique_labels:
@@ -391,21 +523,31 @@ def analyze_subtype_data_regular(show_plots=True):
         freq = tab["frequency[kHz]"].values
         assert len(freq) == len(classification)
         name = f"{cochlea}_tonotopic_mapping"
-        _plot_tonotopic_mapping(freq, classification, name=name, colors=COLORS, show_plots=show_plots)
+        tonotopic_mapping = _plot_tonotopic_mapping(
+            freq, classification, name=name, colors=COLORS, show_plots=show_plots
+        )
 
         # 4.) Plot 2D space of ratios.
-        name = f"{cochlea}_2d"
-        _plot_2d(ratios, name, show_plots, classification=classification, colors=COLORS)
+        if show_2d:
+            name = f"{cochlea}_2d"
+            _plot_2d(ratios, name, show_plots, classification=classification, colors=COLORS)
 
+        results[cochlea] = {"classification": classification, "tonotopic_mapping": tonotopic_mapping}
 
-# General notes:
-# See:
+    combined_analysis(results, show_plots=show_plots)
+
+
+# More TODO:
+# > It's good to see that for the N mice the Ntng1C and Lypd1 separate from CR so well on the thresholds.
+# Can I visualize these samples once the segmentation masks are done to verify the Ntng1C thresholds?
+# As this is quite a clear signal, I'm not sure if taking the middle of the histogram would be the best choice.
+# The segmentations are in MoBIE already. I need to send you the tables for analyzing the signals. Will send them later.
 def main():
-    missing_tables = check_processing_status()
-    require_missing_tables(missing_tables)
-    compile_data_for_subtype_analysis()
+    # missing_tables = check_processing_status()
+    # require_missing_tables(missing_tables)
+    # compile_data_for_subtype_analysis()
 
-    # analyze_subtype_data_regular(show_plots=False)
+    analyze_subtype_data_regular(show_plots=False)
 
     # TODO
     # analyze_subtype_data_N_mice()