Add script for measuring the number of SGNs

constantinpape · constantinpape · commit efd164a3cd25 · 2025-05-02T14:22:06.000+02:00
diff --git a/scripts/measurements/measure_sgns.py b/scripts/measurements/measure_sgns.py
@@ -0,0 +1,48 @@
+import json
+import os
+
+import numpy as np
+import pandas as pd
+from flamingo_tools.s3_utils import create_s3_target, BUCKET_NAME
+
+
+def open_json(fs, path):
+    """Read `path` (relative to the bucket) through `fs` and return the parsed JSON."""
+    bucket_path = os.path.join(BUCKET_NAME, path)
+    with fs.open(bucket_path, "r") as handle:
+        return json.load(handle)
+
+
+def open_tsv(fs, path):
+    """Read the tab-separated file at `path` (relative to the bucket) into a DataFrame."""
+    bucket_path = os.path.join(BUCKET_NAME, path)
+    with fs.open(bucket_path, "r") as handle:
+        return pd.read_csv(handle, sep="\t")
+
+
+def main():
+    """Print SGN counts for each "SGN*" source of every project dataset except "fens"."""
+    fs = create_s3_target()
+    for dataset in open_json(fs, "project.json")["datasets"]:
+        if dataset == "fens":
+            continue
+        print(dataset)
+        sources = open_json(fs, os.path.join(dataset, "dataset.json"))["sources"]
+        for source, source_info in sources.items():
+            if not source.startswith("SGN"):
+                continue
+            if "segmentation" not in source_info:  # explicit check: asserts vanish under -O
+                raise ValueError(f"Source {source!r} in {dataset!r} has no segmentation entry")
+            table_path = source_info["segmentation"]["tableData"]["tsv"]["relativePath"]
+            table = open_tsv(fs, os.path.join(dataset, table_path, "default.tsv"))
+            labels = table.component_labels.values
+            remaining_sgns = labels[labels != 0]  # rows with component label 0 are not counted
+            _, n_per_component = np.unique(remaining_sgns, return_counts=True)
+            print(source)
+            print("Number of SGNs (all components)   :", len(remaining_sgns))
+            # default=0: max() of an empty sequence raises ValueError when nothing remains.
+            print("Number of SGNs (largest component):", max(n_per_component, default=0))
+
+
+if __name__ == "__main__":  # run the measurement only when executed as a script
+    main()