
Commit 33dd804

anna-grim authored
feat: tools for reading from s3 (#139)
Co-authored-by: anna-grim <[email protected]>
1 parent db6302c commit 33dd804

2 files changed: +175 -31 lines changed


src/segmentation_skeleton_metrics/utils/swc_util.py

Lines changed: 71 additions & 22 deletions
@@ -86,41 +86,45 @@ def read(self, swc_pointer):
             - "filename": filename of SWC file
             - "swc_id": name of SWC file, minus the ".swc".
         """
-        # Dictionary with GCS specs
-        if isinstance(swc_pointer, dict):
-            return self.read_from_gcs(swc_pointer)
-
-        # List of paths to SWC files
+        # List of local paths to SWC files
         if isinstance(swc_pointer, list):
             return self.read_from_paths(swc_pointer)
 
         # Directory containing...
         if os.path.isdir(swc_pointer):
-            # ZIP archives with SWC files
+            # Local ZIP archives with SWC files
             paths = util.list_paths(swc_pointer, extension=".zip")
             if len(paths) > 0:
                 return self.read_from_zips(swc_pointer)
 
-            # SWC files
+            # Local SWC files
             paths = util.read_paths(swc_pointer, extension=".swc")
             if len(paths) > 0:
                 return self.read_from_paths(paths)
 
-            raise Exception("Directory is invalid!")
+            raise Exception("Directory is Invalid!")
 
         # Path to...
         if isinstance(swc_pointer, str):
-            # ZIP archive with SWC files
+            # Cloud GCS storage
+            if util.is_gcs_path(swc_pointer):
+                return self.read_from_gcs(swc_pointer)
+
+            # Cloud S3 storage
+            if util.is_s3_path(swc_pointer):
+                return self.read_from_s3(swc_pointer)
+
+            # Local ZIP archive with SWC files
             if swc_pointer.endswith(".zip"):
                 return self.read_from_zip(swc_pointer)
 
-            # Path to single SWC file
+            # Local path to single SWC file
            if swc_pointer.endswith(".swc"):
                 return self.read_from_path(swc_pointer)
 
-            raise Exception("Path is invalid!")
+            raise Exception("Path is Invalid!")
 
-        raise Exception("SWC Pointer is inValid!")
+        raise Exception("SWC Pointer is Invalid!")
 
     def read_from_path(self, path):
         """
@@ -268,15 +272,17 @@ def read_from_zipped_file(self, zipfile, path):
         filename = os.path.basename(path)
         return self.parse(content, filename)
 
-    def read_from_gcs(self, gcs_dict):
+    def read_from_gcs(self, gcs_path):
         """
         Reads SWC files stored in a GCS bucket.
 
         Parameters
         ----------
-        gcs_dict : dict
-            Dictionary with the keys "bucket_name" and "path" that specify
-            where the SWC files are located in a GCS bucket.
+        gcs_path : str
+            Path to a location in a GCS bucket where SWC files are stored.
+            The path must have the format "gs://{bucket_name}/{prefix}",
+            where "prefix" points to a directory containing SWC files or
+            ZIP archives of SWC files.
 
         Returns
         -------
@@ -285,17 +291,18 @@ def read_from_gcs(self, gcs_dict):
             names and values from an SWC file.
         """
         # List filenames
-        swc_paths = util.list_gcs_filenames(gcs_dict, ".swc")
-        zip_paths = util.list_gcs_filenames(gcs_dict, ".zip")
+        bucket_name, prefix = parse_cloud_path(gcs_path)
+        swc_paths = util.list_gcs_filenames(bucket_name, prefix, ".swc")
+        zip_paths = util.list_gcs_filenames(bucket_name, prefix, ".zip")
 
         # Call reader
         if len(swc_paths) > 0:
-            return self.read_from_gcs_swcs(gcs_dict["bucket_name"], swc_paths)
+            return self.read_from_gcs_swcs(bucket_name, swc_paths)
         if len(zip_paths) > 0:
-            return self.read_from_gcs_zips(gcs_dict["bucket_name"], zip_paths)
+            return self.read_from_gcs_zips(bucket_name, zip_paths)
 
         # Error
-        raise Exception(f"GCS Pointer is invalid -{gcs_dict}-")
+        raise Exception(f"GCS Pointer is invalid -{gcs_path}-")
 
     def read_from_gcs_swcs(self, bucket_name, swc_paths):
         """
@@ -419,6 +426,17 @@ def read_from_gcs_zip(self, bucket_name, path):
                 swc_dicts.append(result)
         return swc_dicts
 
+    def read_from_s3_swcs(self, s3_path):
+        """
+        Reads SWC files stored at "s3_path" in an S3 bucket.
+        """
+        # List filenames
+        bucket_name, prefix = parse_cloud_path(s3_path)
+        swc_paths = util.list_s3_paths(bucket_name, prefix, extension=".swc")
+
+        # Parse SWC files
+        swc_dicts = deque()
+        for path in swc_paths:
+            contents = util.read_txt_from_s3(bucket_name, path)
+            swc_dicts.append(self.parse(contents, os.path.basename(path)))
+        return swc_dicts
+
     def confirm_read(self, filename):
         """
         Checks whether the swc_id corresponding to the given filename is
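`read` dispatches S3 pointers to `self.read_from_s3`, but only `read_from_s3_swcs` appears in this diff. A plausible sketch of the missing dispatcher, mirroring `read_from_gcs` and assuming ZIP archives on S3 are not yet supported (hypothetical, not part of this commit):

    def read_from_s3(self, s3_path):
        # Hypothetical router: delegate to the SWC reader added above
        bucket_name, prefix = parse_cloud_path(s3_path)
        if len(util.list_s3_paths(bucket_name, prefix, extension=".swc")) > 0:
            return self.read_from_s3_swcs(s3_path)
        raise Exception(f"S3 Pointer is invalid -{s3_path}-")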
@@ -523,7 +541,38 @@ def read_voxel(self, xyz_str, offset):
         return img_util.to_voxels(xyz, self.anisotropy)
 
 
-# --- Write ---
+# --- Helpers ---
+def parse_cloud_path(path):
+    """
+    Parses a cloud storage path into its bucket name and key/prefix. Supports
+    paths of the form "{scheme}://bucket_name/prefix" or without a scheme.
+
+    Parameters
+    ----------
+    path : str
+        Path to be parsed.
+
+    Returns
+    -------
+    bucket_name : str
+        Name of the bucket.
+    prefix : str
+        Cloud prefix.
+    """
+    # Remove s3:// if present
+    if path.startswith("s3://"):
+        path = path[len("s3://"):]
+
+    # Remove gs:// if present
+    if path.startswith("gs://"):
+        path = path[len("gs://"):]
+
+    parts = path.split("/", 1)
+    bucket_name = parts[0]
+    prefix = parts[1] if len(parts) > 1 else ""
+    return bucket_name, prefix
+
+
 def to_zipped_point(zip_writer, filename, xyz):
     """
     Writes a point to an SWC file format, which is then stored in a ZIP
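Since `parse_cloud_path` strips both schemes, the GCS and S3 readers can share it. A few illustrative input/output pairs (hypothetical bucket names):

    parse_cloud_path("s3://my-bucket/exp1/swcs")  # ("my-bucket", "exp1/swcs")
    parse_cloud_path("gs://my-bucket/exp1/swcs")  # ("my-bucket", "exp1/swcs")
    parse_cloud_path("my-bucket/exp1/swcs")       # ("my-bucket", "exp1/swcs")
    parse_cloud_path("my-bucket")                 # ("my-bucket", "")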

src/segmentation_skeleton_metrics/utils/util.py

Lines changed: 104 additions & 9 deletions
@@ -16,6 +16,9 @@
 from xlwt import Workbook
 from zipfile import ZipFile
 
+import boto3
+from botocore import UNSIGNED
+from botocore.client import Config
 import os
 import pandas as pd
 import shutil
@@ -153,7 +156,24 @@ def update_txt(path, text):
         file.write(text + "\n")
 
 
-# -- GCS utils --
+# -- GCS Utils --
+def is_gcs_path(path):
+    """
+    Checks if the path is a GCS path.
+
+    Parameters
+    ----------
+    path : str
+        Path to be checked.
+
+    Returns
+    -------
+    bool
+        Indication of whether the path is a GCS path.
+    """
+    return path.startswith("gs://")
+
+
 def list_files_in_zip(zip_content):
     """
     Lists all files in a zip file stored in a GCS bucket.
@@ -172,14 +192,16 @@ def list_files_in_zip(zip_content):
     return zip_file.namelist()
 
 
-def list_gcs_filenames(gcs_dict, extension):
+def list_gcs_filenames(bucket_name, prefix, extension):
     """
     Lists all files in a GCS bucket with the given extension.
 
     Parameters
     ----------
-    gcs_dict : dict
-        ...
+    bucket_name : str
+        Name of bucket to be searched.
+    prefix : str
+        Path to location within bucket to be searched.
     extension : str
         File extension of filenames to be listed.
@@ -188,8 +210,8 @@ def list_gcs_filenames(gcs_dict, extension):
     List[str]
         Filenames stored at "cloud" path with the given extension.
     """
-    bucket = storage.Client().bucket(gcs_dict["bucket_name"])
-    blobs = bucket.list_blobs(prefix=gcs_dict["path"])
+    bucket = storage.Client().bucket(bucket_name)
+    blobs = bucket.list_blobs(prefix=prefix)
     return [blob.name for blob in blobs if extension in blob.name]
 
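One behavioral difference worth noting: this GCS filter keeps any blob whose name merely contains the extension, while the S3 helper added below matches with `str.endswith`, so the two backends can disagree on edge cases:

    ".swc" in "old.swc.bak"           # True  -> kept by list_gcs_filenames
    "old.swc.bak".endswith(".swc")    # False -> dropped by list_s3_paths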
@@ -227,16 +249,16 @@ def list_gcs_subdirectories(bucket_name, prefix):
     return subdirs
 
 
-def read_txt_from_gcs(bucket_name, filename):
+def read_txt_from_gcs(bucket_name, path):
     """
     Reads a txt file stored in a GCS bucket.
 
     Parameters
     ----------
     bucket_name : str
         Name of bucket to be read from.
-    filename : str
-        Name of txt file to be read.
+    path : str
+        Path to txt file to be read.
 
     Returns
     -------
@@ -277,6 +299,79 @@ def upload_directory_to_gcs(bucket_name, source_dir, destination_dir):
         blob.upload_from_filename(local_path)
 
 
+# --- S3 Utils ---
+def is_s3_path(path):
+    """
+    Checks if the given path is an S3 path.
+
+    Parameters
+    ----------
+    path : str
+        Path to be checked.
+
+    Returns
+    -------
+    bool
+        Indication of whether the path is an S3 path.
+    """
+    return path.startswith("s3://")
+
+
+def list_s3_paths(bucket_name, prefix, extension=""):
+    """
+    Lists all object keys in a public S3 bucket under a given prefix,
+    optionally filtered by file extension.
+
+    Parameters
+    ----------
+    bucket_name : str
+        Name of the S3 bucket.
+    prefix : str
+        The S3 "directory" prefix to search under.
+    extension : str, optional
+        File extension to filter by. Default is an empty string, which
+        returns all files.
+
+    Returns
+    -------
+    List[str]
+        List of S3 object keys that match the prefix and extension filter.
+    """
+    # Create an anonymous client for public buckets
+    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+
+    # Filter the objects under the prefix by extension
+    filenames = list()
+    if "Contents" in response:
+        for obj in response["Contents"]:
+            filename = obj["Key"]
+            if filename.endswith(extension):
+                filenames.append(filename)
+    return filenames
+
+
+def read_txt_from_s3(bucket_name, path):
+    """
+    Reads a txt file stored in an S3 bucket.
+
+    Parameters
+    ----------
+    bucket_name : str
+        Name of bucket to be read from.
+    path : str
+        Path to txt file to be read.
+
+    Returns
+    -------
+    List[str]
+        Contents of the txt file as a list of lines.
+    """
+    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
+    obj = s3.get_object(Bucket=bucket_name, Key=path)
+    return obj["Body"].read().decode("utf-8").splitlines()
+
+
 # --- Miscellaneous ---
 def get_segment_id(filename):
     """
