Skip to content

Commit 248dbda

Browse files
anna-grim and anna-grim
authored
feat: merge detection on cloud (#53)
Co-authored-by: anna-grim <[email protected]>
1 parent b437561 commit 248dbda

File tree

3 files changed

+234
-115
lines changed

3 files changed

+234
-115
lines changed

src/segmentation_skeleton_metrics/skeleton_metric.py

Lines changed: 24 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,7 @@
1818

1919
from segmentation_skeleton_metrics import graph_utils as gutils
2020
from segmentation_skeleton_metrics import split_detection, swc_utils, utils
21-
from segmentation_skeleton_metrics.swc_utils import (
22-
get_xyz_coords,
23-
save,
24-
to_graph,
25-
)
21+
from segmentation_skeleton_metrics.swc_utils import save, to_graph
2622

2723
INTERSECTION_THRESHOLD = 8
2824
MERGE_DIST_THRESHOLD = 40
@@ -54,8 +50,7 @@ def __init__(
5450
equivalent_ids=None,
5551
ignore_boundary_mistakes=False,
5652
output_dir=None,
57-
pred_on_cloud=False,
58-
valid_size_threshold=40.0,
53+
valid_size_threshold=40,
5954
write_to_swc=False,
6055
):
6156
"""
@@ -66,15 +61,16 @@ def __init__(
6661
----------
6762
pred_labels : numpy.ndarray or tensorstore.TensorStore
6863
Predicted segmentation mask.
69-
pred_swc_paths : list[str]
70-
List of paths to swc files where each file corresponds to a
71-
neuron in the prediction.
7264
target_swc_paths : list[str]
7365
List of paths to swc files where each file corresponds to a
7466
neuron in the ground truth.
7567
anisotropy : list[float], optional
7668
Image to real-world coordinates scaling factors applied to swc
7769
files. The default is [1.0, 1.0, 1.0]
70+
pred_swc_paths : list[str] or dict
71+
If swc files are on local machine, list of paths to swc files where
72+
each file corresponds to a neuron in the prediction. If swc files
73+
are on cloud, then dict with keys "bucket_name" and "path".
7874
black_holes_xyz_id : list, optional
7975
...
8076
black_hole_radius : float, optional
@@ -87,11 +83,10 @@ def __init__(
8783
output_dir : str, optional
8884
Path to directory that each mistake site is written to. The default
8985
is None.
90-
pred_on_cloud : bool, optional
91-
Indication of whether predicted swc files in "pred_swc_paths" are
92-
on the cloud in a GCS bucket. The default is False.
93-
valid_size_threshold : float, optional
94-
...
86+
valid_size_threshold : int, optional
87+
Threshold on the number of nodes contained in an swc file. Only swc
88+
files with more than "valid_size_threshold" nodes are stored in
89+
"self.valid_labels". The default is 40.
9590
write_to_swc : bool, optional
9691
Indication of whether to write mistake sites to an swc file. The
9792
default is False.
@@ -112,8 +107,9 @@ def __init__(
112107

113108
# Build Graphs
114109
self.label_mask = pred_labels
115-
self.pred_swc_paths = pred_swc_paths
116-
self.init_valid_labels(valid_size_threshold)
110+
self.valid_labels = swc_utils.parse(
111+
pred_swc_paths, valid_size_threshold, anisotropy=anisotropy
112+
)
117113

118114
self.target_graphs = self.init_graphs(target_swc_paths, anisotropy)
119115
self.labeled_target_graphs = self.init_labeled_target_graphs()
@@ -124,13 +120,6 @@ def __init__(
124120
self.rm_spurious_intersections()
125121

126122
# -- Initialize and Label Graphs --
127-
def init_valid_labels(self, valid_size_threshold):
128-
self.valid_labels = set()
129-
for path in self.pred_swc_paths:
130-
contents = swc_utils.read(path)
131-
if len(contents) > valid_size_threshold:
132-
self.valid_labels.add(int(utils.get_swc_id(path)))
133-
134123
def init_graphs(self, paths, anisotropy):
135124
"""
136125
Initializes "self.target_graphs" by iterating over "paths" which
@@ -236,12 +225,13 @@ def get_label(self, img_coord, return_node=False):
236225
Label of voxel at "img_coord".
237226
238227
"""
239-
label = self.__read_label(img_coord)
228+
# Read label
240229
if self.in_black_hole(img_coord):
241230
label = -1
242-
return self.finalize_label(label, return_node)
231+
else:
232+
label = self.__read_label(img_coord)
243233

244-
def finalize_label(self, label, return_node):
234+
# Validate label
245235
if return_node:
246236
return return_node, self.is_valid(label)
247237
else:
@@ -286,7 +276,7 @@ def is_valid(self, label):
286276
287277
"""
288278
if self.valid_labels:
289-
if label not in self.valid_labels:
279+
if label not in self.valid_labels.keys():
290280
return 0
291281
return label
292282

@@ -305,7 +295,7 @@ def rm_spurious_intersections(self):
305295
# Compute label intersect target_graphs
306296
hit_target_ids = dict()
307297
multi_hits = set()
308-
for xyz in self.get_pred_xyz(label):
298+
for xyz in self.get_pred_coords(label):
309299
hat_xyz, d = self.get_projection(xyz)
310300
if d < 5:
311301
hits = list(self.xyz_to_id_node[hat_xyz].keys())
@@ -329,12 +319,11 @@ def rm_spurious_intersections(self):
329319
elif label in self.id_to_label_nodes[target_id]:
330320
self.zero_nodes(target_id, label)
331321

332-
def get_pred_xyz(self, label):
333-
for path in self.pred_swc_paths:
334-
swc_id = utils.get_swc_id(path)
335-
if str(label) == swc_id:
336-
return get_xyz_coords(path, anisotropy=self.anisotropy)
337-
return []
322+
def get_pred_coords(self, label):
323+
if label in self.valid_labels.keys():
324+
return self.valid_labels[label]
325+
else:
326+
return []
338327

339328
# -- Final Constructor Routines --
340329
def init_kdtree(self):

src/segmentation_skeleton_metrics/swc_utils.py

Lines changed: 120 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,107 @@
77
88
"""
99

10+
from concurrent.futures import ThreadPoolExecutor, as_completed
11+
from io import BytesIO
12+
from zipfile import ZipFile
13+
1014
import networkx as nx
1115
import numpy as np
16+
from google.cloud import storage
1217

1318
from segmentation_skeleton_metrics import utils
1419

1520

16-
def read(path, cloud_read=False):
17-
return read_from_cloud(path) if cloud_read else read_from_local(path)
21+
def parse(swc_paths, min_size, anisotropy=[1.0, 1.0, 1.0]):
22+
"""
23+
Reads swc files and extracts the xyz coordinates.
24+
25+
Parameters
26+
---------
27+
swc_paths : list or dict
28+
If swc files are on local machine, list of paths to swc files where
29+
each file corresponds to a neuron in the prediction. If swc files are
30+
on cloud, then dict with keys "bucket_name" and "path".
31+
min_size : int
32+
Threshold on the number of nodes contained in an swc file. Only swc
33+
files with more than "min_size" nodes are stored in "valid_labels".
34+
anisotropy : list[float]
35+
Image to World scaling factors applied to xyz coordinates to account
36+
for anisotropy of the microscope.
1837
38+
Returns
39+
-------
40+
dict
41+
Dictionary that maps the swc id of each valid file to the xyz coordinates read from that file.
42+
"""
43+
if type(swc_paths) == list:
44+
return parse_local_paths(swc_paths, min_size, anisotropy)
45+
elif type(swc_paths) == dict:
46+
return parse_cloud_paths(swc_paths, min_size, anisotropy)
47+
else:
48+
return None
49+
50+
51+
def parse_local_paths(pred_swc_paths, min_size, anisotropy):
52+
valid_labels = dict()
53+
for path in pred_swc_paths:
54+
contents = read_from_local(path)
55+
if len(contents) > min_size:
56+
swc_id = int(utils.get_swc_id(path))
57+
valid_labels[swc_id] = get_coords(contents, anisotropy)
58+
return valid_labels
59+
60+
61+
def parse_cloud_paths(cloud_dict, min_size, anisotropy):
62+
# Initializations
63+
bucket = storage.Client().bucket(cloud_dict["bucket_name"])
64+
zip_paths = utils.list_gcs_filenames(bucket, cloud_dict["path"], ".zip")
65+
chunk_size = int(len(zip_paths) * 0.02)
66+
67+
# Parse
68+
cnt = 1
69+
valid_labels = dict()
70+
print("Downloading predicted swc files from cloud...")
71+
print("# zip files:", len(zip_paths))
72+
for i, path in enumerate(zip_paths):
73+
valid_labels.update(download(bucket, path, min_size, anisotropy))
74+
if i > cnt * chunk_size:
75+
utils.progress_bar(i + 1, len(zip_paths))
76+
cnt += 1
77+
78+
# Report Results
79+
print("\n#Valid Labels:", len(valid_labels))
80+
print("")
81+
return valid_labels
82+
83+
84+
def download(bucket, zip_path, min_size, anisotropy):
85+
zip_content = bucket.blob(zip_path).download_as_bytes()
86+
with ZipFile(BytesIO(zip_content)) as zip_file:
87+
with ThreadPoolExecutor() as executor:
88+
# Assign threads
89+
threads = []
90+
for path in utils.list_files_in_gcs_zip(zip_content):
91+
threads.append(
92+
executor.submit(
93+
parse_gcs_zip, zip_file, path, min_size, anisotropy
94+
)
95+
)
1996

20-
def read_from_cloud(path):
21-
pass
97+
# Process results
98+
valid_labels = dict()
99+
for thread in as_completed(threads):
100+
valid_labels.update(thread.result())
101+
return valid_labels
102+
103+
104+
def parse_gcs_zip(zip_file, path, min_size, anisotropy):
105+
contents = read_from_cloud(zip_file, path)
106+
if len(contents) > min_size:
107+
swc_id = int(utils.get_swc_id(path))
108+
return {swc_id: get_coords(contents, anisotropy)}
109+
else:
110+
return dict()
22111

23112

24113
def read_from_local(path):
@@ -40,38 +129,44 @@ def read_from_local(path):
40129
return file.readlines()
41130

42131

43-
def get_xyz_coords(path, anisotropy=[1.0, 1.0, 1.0]):
132+
def read_from_cloud(zip_file, path):
133+
"""
134+
Reads the content of an swc file from a zip file in a GCS bucket.
135+
136+
"""
137+
with zip_file.open(path) as text_file:
138+
return text_file.read().decode("utf-8").splitlines()
139+
140+
141+
def get_coords(contents, anisotropy):
44142
"""
45143
Gets the xyz coords from the swc file at "path".
46144
47145
Parameters
48146
----------
49147
contents : list[str]
50148
Lines of an swc file to be parsed.
51-
anisotropy : list[float], optional
52-
Scaling factors applied to xyz coordinates to account for anisotropy
53-
of the microscope. The default is [1.0, 1.0, 1.0].
149+
anisotropy : list[float]
150+
Image to World scaling factors applied to xyz coordinates to account
151+
for anisotropy of the microscope.
54152
55153
Returns
56154
-------
57155
numpy.ndarray
58156
xyz coords from an swc file.
59157
60158
"""
61-
xyz_list = []
62-
with open(path, "r") as f:
63-
offset = [0, 0, 0]
64-
for line in f.readlines():
65-
if line.startswith("# OFFSET"):
66-
parts = line.split()
67-
offset = read_xyz(parts[2:5])
68-
if not line.startswith("#"):
69-
parts = line.split()
70-
xyz = read_xyz(
71-
parts[2:5], anisotropy=anisotropy, offset=offset
72-
)
73-
xyz_list.append(xyz)
74-
return np.array(xyz_list)
159+
coords_list = []
160+
offset = [0, 0, 0]
161+
for line in contents:
162+
if line.startswith("# OFFSET"):
163+
parts = line.split()
164+
offset = read_xyz(parts[2:5])
165+
if not line.startswith("#"):
166+
parts = line.split()
167+
coord = read_xyz(parts[2:5], anisotropy=anisotropy, offset=offset)
168+
coords_list.append(coord)
169+
return np.array(coords_list)
75170

76171

77172
def read_xyz(xyz, anisotropy=[1.0, 1.0, 1.0], offset=[0, 0, 0]):
@@ -81,8 +176,8 @@ def read_xyz(xyz, anisotropy=[1.0, 1.0, 1.0], offset=[0, 0, 0]):
81176
82177
Parameters
83178
----------
84-
xyz : str
85-
(x,y,z) coordinates.
179+
xyz : str
180+
(x,y,z) coordinate to be parsed.
86181
anisotropy : list[float], optional
87182
Image to real-world coordinates scaling factors applied to "xyz". The
88183
default is [1.0, 1.0, 1.0].
@@ -178,7 +273,7 @@ def to_graph(path, anisotropy=[1.0, 1.0, 1.0]):
178273
"""
179274
graph = nx.Graph(swc_id=utils.get_swc_id(path))
180275
offset = [0, 0, 0]
181-
for line in read(path):
276+
for line in read_from_local(path):
182277
if line.startswith("# OFFSET"):
183278
parts = line.split()
184279
offset = read_xyz(parts[2:5])

0 commit comments

Comments
 (0)