Skip to content

Commit 06ac936

Browse files
Update SGN subtype analysis
1 parent 2a7b66e commit 06ac936

File tree

6 files changed

+95
-46
lines changed

6 files changed

+95
-46
lines changed

reproducibility/label_components/repro_label_components.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
from flamingo_tools.s3_utils import get_s3_path
88
from flamingo_tools.segmentation.postprocessing import label_components_sgn, label_components_ihc
9+
from flamingo_tools.segmentation.cochlea_mapping import tonotopic_mapping
910

1011

1112
def repro_label_components(
@@ -14,6 +15,7 @@ def repro_label_components(
1415
s3_credentials: Optional[str] = None,
1516
s3_bucket_name: Optional[str] = None,
1617
s3_service_endpoint: Optional[str] = None,
18+
apply_tonotopic_mapping: bool = False,
1719
):
1820
min_size = 1000
1921
default_threshold_erode = None
@@ -23,7 +25,7 @@ def repro_label_components(
2325
default_cell_type = "sgn"
2426
default_component_list = [1]
2527

26-
with open(ddict, 'r') as myfile:
28+
with open(ddict, "r") as myfile:
2729
data = myfile.read()
2830
param_dicts = json.loads(data)
2931

@@ -39,12 +41,16 @@ def repro_label_components(
3941
cell_type = dic["cell_type"] if "cell_type" in dic else default_cell_type
4042
component_list = dic["component_list"] if "component_list" in dic else default_component_list
4143

42-
table_name = f"{cell_type.upper()}_{unet_version}"
44+
# The table name sometimes has to be over-written.
4345
# table_name = "PV_SGN_V2_DA"
46+
# table_name = "CR_SGN_v2"
47+
48+
table_name = f"{cell_type.upper()}_{unet_version}"
49+
4450
s3_path = os.path.join(f"{cochlea}", "tables", table_name, "default.tsv")
4551
tsv_path, fs = get_s3_path(s3_path, bucket_name=s3_bucket_name,
4652
service_endpoint=s3_service_endpoint, credential_file=s3_credentials)
47-
with fs.open(tsv_path, 'r') as f:
53+
with fs.open(tsv_path, "r") as f:
4854
table = pd.read_csv(f, sep="\t")
4955

5056
if cell_type == "sgn":
@@ -67,8 +73,12 @@ def repro_label_components(
6773
else:
6874
print(f"Custom component(s) have {largest_comp} {cell_type.upper()}s.")
6975

76+
if apply_tonotopic_mapping:
77+
tsv_table = tonotopic_mapping(tsv_table, cell_type=cell_type)
78+
7079
cochlea_str = "-".join(cochlea.split("_"))
7180
table_str = "-".join(table_name.split("_"))
81+
os.makedirs(output_dir, exist_ok=True)
7282
out_path = os.path.join(output_dir, "_".join([cochlea_str, f"{table_str}.tsv"]))
7383

7484
tsv_table.to_csv(out_path, sep="\t", index=False)
@@ -78,8 +88,9 @@ def main():
7888
parser = argparse.ArgumentParser(
7989
description="Script to label segmentation using a segmentation table and graph connected components.")
8090

81-
parser.add_argument('-i', '--input', type=str, required=True, help="Input JSON dictionary.")
82-
parser.add_argument('-o', "--output", type=str, required=True, help="Output directory.")
91+
parser.add_argument("-i", "--input", type=str, required=True, help="Input JSON dictionary.")
92+
parser.add_argument("-o", "--output", type=str, required=True, help="Output directory.")
93+
parser.add_argument("-t", "--tonotopic_mapping", action="store_true", help="Also compute the tonotopic mapping.")
8394

8495
parser.add_argument("--s3_credentials", type=str, default=None,
8596
help="Input file containing S3 credentials. "
@@ -94,6 +105,7 @@ def main():
94105
repro_label_components(
95106
args.input, args.output,
96107
args.s3_credentials, args.s3_bucket_name, args.s3_service_endpoint,
108+
apply_tonotopic_mapping=args.tonotopic_mapping,
97109
)
98110

99111

reproducibility/templates_processing/REAMDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ For IHC segmentation run:
1515
After this, run the following to add segmentation to MoBIE, create component labelings and upload to S3:
1616
- templates_transfer/mobie_segmentation_template.sbatch
1717
- templates_transfer/s3_seg_template.sh
18-
- repro_label_components.py
18+
- label_components/repro_label_components.py
1919
- templates_transfer/s3_seg_template.sh
2020

2121
For ribbon synapse detection without associated IHC segmentation run

reproducibility/templates_processing/apply_unet_SGN_template.sbatch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ export MODEL=/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/train
4747

4848
export PREDICTION_INSTANCES=10
4949

50-
# export INPUT_KEY="setup$STAIN_CHANNEL/timepoint0/s0"
51-
export INPUT_KEY="s0"
50+
export INPUT_KEY="setup$STAIN_CHANNEL/timepoint0/s0"
51+
# export INPUT_KEY="s0"
5252

5353
echo "Input directory: ${INPUT}"
5454
echo "Output directory: ${OUTPUT_FOLDER}"

reproducibility/templates_processing/mean_std_SGN_template.sbatch

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ export INPUT=/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/"$COC
3333
export OUTPUT_FOLDER=/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/predictions/"$COCHLEA"/"$SEG_NAME"
3434
export SEG_CLASS="sgn"
3535

36-
# export INPUT_KEY="setup$STAIN_CHANNEL/timepoint0/s0"
37-
export INPUT_KEY="s0"
36+
export INPUT_KEY="setup$STAIN_CHANNEL/timepoint0/s0"
37+
# export INPUT_KEY="s0"
3838

3939
if ! [[ -f $OUTPUT_FOLDER ]] ; then
4040
mkdir -p "$OUTPUT_FOLDER"
@@ -51,4 +51,3 @@ cmd_array=( 'import sys,os;'
5151
'output_folder=os.environ["OUTPUT_FOLDER"],seg_class=os.environ["SEG_CLASS"])')
5252
cmd="${cmd_array[*]}"
5353
python -c "$cmd"
54-

scripts/measurements/sgn_subtypes.py

Lines changed: 67 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import json
22
import os
3+
from glob import glob
4+
from subprocess import run
35

46
import pandas as pd
5-
from skimage.filters import threshold_otsu
67

78
from flamingo_tools.s3_utils import BUCKET_NAME, create_s3_target, get_s3_path
89
from flamingo_tools.measurements import compute_object_measures
@@ -116,7 +117,7 @@ def require_missing_tables(missing_tables):
116117
seg_name = "PV_SGN_v2" if "PV" in COCHLEAE_FOR_SUBTYPES[cochlea] else "CR_SGN_v2"
117118
for missing in missing_tabs:
118119
channel = missing.split("_")[0]
119-
print(cochlea, channel)
120+
print("Computing intensities for:", cochlea, channel)
120121

121122
img_s3 = f"{cochlea}/images/ome-zarr/{channel}.ome.zarr"
122123
seg_s3 = f"{cochlea}/images/ome-zarr/{seg_name}.ome.zarr"
@@ -126,7 +127,9 @@ def require_missing_tables(missing_tables):
126127

127128
output_folder = os.path.join(output_root, cochlea)
128129
os.makedirs(output_folder, exist_ok=True)
129-
output_table_path = os.path.join(output_folder, f"{channel}_{seg_name}_object-measures.tsv")
130+
output_table_path = os.path.join(
131+
output_folder, f"{channel}_{seg_name.replace('_', '-')}_object-measures.tsv"
132+
)
130133
compute_object_measures(
131134
image_path=img_path,
132135
segmentation_path=seg_path,
@@ -136,17 +139,17 @@ def require_missing_tables(missing_tables):
136139
segmentation_key="s0",
137140
s3_flag=True,
138141
component_list=[1],
139-
n_threads=8,
142+
n_threads=16,
140143
)
141-
return
142144

143-
# TODO S3 upload
145+
# S3 upload
146+
run(["rclone", "--progress", "copyto", output_folder,
147+
f"cochlea-lightsheet:cochlea-lightsheet/{cochlea}/tables/{seg_name}"])
144148

145149

146-
def get_data_for_subtype_analysis():
150+
def compile_data_for_subtype_analysis():
147151
s3 = create_s3_target()
148152

149-
threshold_dict = {}
150153
output_folder = "./subtype_analysis"
151154
os.makedirs(output_folder, exist_ok=True)
152155

@@ -176,47 +179,74 @@ def get_data_for_subtype_analysis():
176179
table = table[table.component_labels == 1]
177180
valid_sgns = table.label_id
178181

179-
output_table = {"label_id": table.label_id.values}
180-
threshold_dict[cochlea] = {}
182+
output_table = {"label_id": table.label_id.values, "frequency[kHz]": table["frequency[kHz]"]}
181183

182184
# Analyze the different channels (= different subtypes).
185+
reference_intensity = None
183186
for channel in channels:
184187
# Load the intensity table.
185-
intensity_path = os.path.join(table_folder, f"{channel}_PV-SGN-v2_object-measures.tsv")
186-
try:
187-
table_content = s3.open(intensity_path, mode="rb")
188-
except FileNotFoundError:
189-
print(intensity_path, "is missing")
190-
continue
188+
intensity_path = os.path.join(table_folder, f"{channel}_{seg_name.replace('_', '-')}_object-measures.tsv")
189+
table_content = s3.open(intensity_path, mode="rb")
190+
191191
intensities = pd.read_csv(table_content, sep="\t")
192192
intensities = intensities[intensities.label_id.isin(valid_sgns)]
193193
assert len(table) == len(intensities)
194194
assert (intensities.label_id.values == table.label_id.values).all()
195195

196-
# Intensity based analysis.
197196
medians = intensities["median"].values
198-
199-
# TODO: we need to determine the threshold in a better way / validate it in MoBIE.
200-
intensity_threshold = THRESHOLDS.get(cochlea, {}).get(channel, None)
201-
if intensity_threshold is None:
202-
print("Could not find a threshold for", cochlea, channel, "falling back to OTSU")
203-
intensity_threshold = float(threshold_otsu(medians))
204-
threshold_dict[cochlea][channel] = intensity_threshold
205-
206-
subtype = CHANNEL_TO_TYPE[channel]
207197
output_table[f"{channel}_median"] = medians
208-
output_table[f"is_{subtype}"] = medians > intensity_threshold
209-
210-
# Add the frequency mapping.
211-
# TODO
198+
if channel == reference_channel:
199+
reference_intensity = medians
200+
else:
201+
assert reference_intensity is not None
202+
output_table[f"{channel}_ratio_{reference_channel}"] = medians / reference_intensity
212203

213204
out_path = os.path.join(output_folder, f"{cochlea}_subtype_analysis.tsv")
214205
output_table = pd.DataFrame(output_table)
215-
output_table.to_csv(out_path, sep="\t")
206+
output_table.to_csv(out_path, sep="\t", index=False)
207+
208+
209+
def _plot_histogram(table, column, name, show_plots):
210+
data = table[column].values
211+
212+
# TODO determine automatic threshold
216213

217-
threshold_out = os.path.join(output_folder, "thresholds.json")
218-
with open(threshold_out, "w") as f:
219-
json.dump(threshold_dict, f, sort_keys=True, indent=4)
214+
if show_plots:
215+
pass
216+
else:
217+
pass
218+
219+
220+
# TODO enable over-writing by manual thresholds
221+
def analyze_subtype_data(show_plots=True):
222+
files = sorted(glob("./subtype_analysis/*.tsv"))
223+
224+
for ff in files:
225+
cochlea = os.path.basename(ff)[:-len("_subtype_analysis.tsv")]
226+
print(cochlea)
227+
channels = COCHLEAE_FOR_SUBTYPES[cochlea]
228+
reference_channel = "PV" if "PV" in channels else "CR"
229+
assert channels[0] == reference_channel
230+
231+
tab = pd.read_csv(ff, sep="\t")
232+
breakpoint()
233+
234+
# 1.) Plot simple intensity histograms, including otsu threshold.
235+
for chan in channels:
236+
column = f"{chan}_median"
237+
name = f"{cochlea}_{chan}_histogram.png"
238+
_plot_histogram(tab, column, name, show_plots)
239+
240+
# 2.) Plot ratio histograms, including otsu threshold.
241+
ratios = {}
242+
# TODO ratio based classification and overlay in 2d plot?
243+
for chan in channels[1:]:
244+
column = f"{chan}_median_ratio_{reference_channel}"
245+
name = f"{cochlea}_{chan}_histogram_ratio_{reference_channel}.png"
246+
_plot_histogram(tab, column, name, show_plots)
247+
ratios[f"{chan}_{reference_channel}"] = tab[column].values
248+
249+
# 3.) Plot 2D space of ratios.
220250

221251

222252
# General notes:
@@ -229,7 +259,9 @@ def main():
229259
missing_tables = check_processing_status()
230260
require_missing_tables(missing_tables)
231261

232-
# analyze_subtypes_intensity_based()
262+
# compile_data_for_subtype_analysis()
263+
264+
# analyze_subtype_data()
233265

234266

235267
if __name__ == "__main__":

scripts/more-annotations/extract_sgn_annotations.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,19 @@ def downscale_segmentation():
9494
)
9595

9696

97+
# Note: consider different normalization strategy for these cochleae and normalize by local intensity
98+
# rather than by global values.
99+
97100
# Also double check empty positions again and make sure they don't contain SGNs
98101
# Additional positions for LaVision annotations:
99102
# {"position":[2031.0655170248258,1925.206039671767,249.14546086048554],"timepoint":0}
100103
# {"position":[2378.3720460599393,2105.471228531872,303.9285928812524],"timepoint":0}
101104
# {"position":[1619.3251178227529,3444.7351705689553,271.2360278843609],"timepoint":0}
102105
# {"position":[2358.2784398426843,1503.2211953830192,762.7325586759833],"timepoint":0}
103106

107+
# Position in Marmoset:
108+
# {"position":[2462.7875134103206,2818.067344942212,1177.1380214828991],"timepoint":0}
109+
104110

105111
def main():
106112
# download_lavision_crops()

0 commit comments

Comments (0)