Skip to content

Commit 9e37f39

Browse files
anna-grim and anna-grim authored
Feat s3 loading (#174)
* refactor: improved txt reader * remove print * feat: load swcs from s3 --------- Co-authored-by: anna-grim <anna.grim@alleninstitute.org>
1 parent f8fa4d9 commit 9e37f39

File tree

4 files changed

+118
-3
lines changed

4 files changed

+118
-3
lines changed

src/segmentation_skeleton_metrics/data_handling/graph_loading.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,7 @@ def build_labels_graph(self, connections_path):
588588
labels_graph.add_nodes_from(self.valid_labels)
589589

590590
# Main
591-
for line in util.read_txt(connections_path):
591+
for line in util.read_txt(connections_path).splitlines():
592592
ids = line.split(",")
593593
id_1 = util.get_segment_id(ids[0])
594594
id_2 = util.get_segment_id(ids[1])

src/segmentation_skeleton_metrics/data_handling/swc_loading.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
these attributes in the same order.
1919
"""
2020

21+
from botocore import UNSIGNED
22+
from botocore.client import Config
2123
from collections import deque
2224
from concurrent.futures import (
2325
as_completed,
@@ -29,6 +31,7 @@
2931
from tqdm import tqdm
3032
from zipfile import ZipFile
3133

34+
import boto3
3235
import numpy as np
3336
import os
3437

@@ -433,6 +436,33 @@ def read_from_s3(self, s3_path):
433436
"""
434437
Reads and parses SWC files from an S3 directory.
435438
439+
Parameters
440+
----------
441+
s3_path : str
442+
Path to a directory in an S3 bucket containing SWC files or ZIPs
443+
of SWC files to be read.
444+
445+
Returns
446+
-------
447+
swc_dicts : Dequeue[dict]
448+
Dictionaries whose keys and values are the attribute names and
449+
values from an SWC file.
450+
"""
451+
# List filenames
452+
bucket_name, prefix = util.parse_cloud_path(s3_path)
453+
swc_paths = util.list_s3_filenames(bucket_name, prefix, ".swc")
454+
zip_paths = util.list_s3_filenames(bucket_name, prefix, ".zip")
455+
456+
# Call reader
457+
if len(swc_paths) > 0:
458+
return self.read_from_s3_swcs(bucket_name, swc_paths)
459+
if len(zip_paths) > 0:
460+
return self.read_from_s3_zips(bucket_name, zip_paths)
461+
462+
def read_from_s3_swcs(self, bucket_name, swc_paths):
463+
"""
464+
Reads and parses SWC files from an S3 directory.
465+
436466
Parameters
437467
----------
438468
s3_path : str
@@ -459,6 +489,86 @@ def read_from_s3(self, s3_path):
459489
swc_dicts.append(result)
460490
return swc_dicts
461491

492+
def read_from_s3_zips(self, bucket_name, zip_paths):
493+
"""
494+
Reads SWC files stored in a list of ZIP archives stored in an S3
495+
bucket.
496+
497+
Parameters
498+
----------
499+
bucket_name : str
500+
Name of bucket containing SWC files.
501+
zip_paths : str
502+
Path to ZIP archive containing SWC files to be read.
503+
504+
Returns
505+
-------
506+
swc_dicts : Dequeue[dict]
507+
Dictionaries whose keys and values are the attribute names and
508+
values from an SWC file.
509+
"""
510+
with ProcessPoolExecutor() as executor:
511+
# Submit processes
512+
processes = list()
513+
for zip_path in zip_paths:
514+
processes.append(
515+
executor.submit(
516+
self.read_from_s3_zip, bucket_name, zip_path
517+
)
518+
)
519+
520+
# Store results
521+
pbar = tqdm(total=len(processes), desc="Read SWCs")
522+
swc_dicts = deque()
523+
for process in as_completed(processes):
524+
result = process.result()
525+
if result:
526+
swc_dicts.extend(result)
527+
return swc_dicts
528+
529+
def read_from_s3_zip(self, bucket_name, path):
530+
"""
531+
Reads SWC files stored in a ZIP archive downloaded from an S3
532+
bucket.
533+
534+
Parameters
535+
----------
536+
bucket_name : str
537+
Name of bucket containing SWC files.
538+
path : str
539+
Path to ZIP archive containing SWC files to be read.
540+
541+
Returns
542+
-------
543+
swc_dicts : Dequeue[dict]
544+
Dictionaries whose keys and values are the attribute names and
545+
values from an SWC file.
546+
"""
547+
# Initialize cloud reader
548+
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
549+
zip_obj = s3.get_object(Bucket=bucket_name, Key=path)
550+
zip_content = zip_obj["Body"].read()
551+
552+
# Parse ZIP
553+
swc_dicts = deque()
554+
with ZipFile(BytesIO(zip_content), "r") as zip_file:
555+
with ThreadPoolExecutor() as executor:
556+
# Assign threads for reading files
557+
threads = [
558+
executor.submit(
559+
self.read_from_zipped_file, zip_file, filename
560+
)
561+
for filename in zip_file.namelist()
562+
if self.confirm_read(filename)
563+
]
564+
565+
# Collect results
566+
for thread in as_completed(threads):
567+
result = thread.result()
568+
if result:
569+
swc_dicts.append(result)
570+
return swc_dicts
571+
462572
def confirm_read(self, filename):
463573
"""
464574
Checks whether the swc_id corresponding to the given filename is

src/segmentation_skeleton_metrics/skeleton_metrics.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -922,7 +922,12 @@ def __call__(self, gt_graphs, fragment_graphs, merge_sites):
922922
DataFrame where the indices are the dictionary keys and values are
923923
stored under a column called "self.name".
924924
"""
925-
pbar = self.get_pbar(len(merge_sites.index))
925+
# Check if merge sites is non-empty
926+
if len(merge_sites) == 0:
927+
return _
928+
929+
# Compute metric
930+
pbar = self.get_pbar(len(merge_sites))
926931
pair_to_length = dict()
927932
for i in merge_sites.index:
928933
# Extract site info

src/segmentation_skeleton_metrics/utils/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ def is_s3_path(path):
398398
return path.startswith("s3://")
399399

400400

401-
def list_s3_paths(bucket_name, prefix, extension=""):
401+
def list_s3_filenames(bucket_name, prefix, extension=""):
402402
"""
403403
Lists all object keys in a public S3 bucket under a given prefix,
404404
optionally filters by file extension.

0 commit comments

Comments (0)