Skip to content

Commit 95a70d8

Browse files
anna-grim authored
Feat save merge sites (#106)
* refactor: improved new features * improved new feature * refactor: merge site save * bug: mkdir output_dir * feat: read zips of swcs on gcs --------- Co-authored-by: anna-grim <[email protected]>
1 parent ea049f1 commit 95a70d8

File tree

2 files changed

+174
-0
lines changed

2 files changed

+174
-0
lines changed

src/segmentation_skeleton_metrics/utils/swc_util.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
ProcessPoolExecutor,
2929
ThreadPoolExecutor,
3030
)
31+
from google.cloud import storage
3132
from io import StringIO
3233
from tqdm import tqdm
3334
from zipfile import ZipFile
@@ -95,6 +96,10 @@ def read(self, swc_pointer):
9596
- "swc_id": name of SWC file, minus the ".swc".
9697
9798
"""
99+
# Dictionary with GCS specs
100+
if isinstance(swc_pointer, dict):
101+
return self.read_from_gcs(swc_pointer)
102+
98103
# List of paths to SWC files
99104
if isinstance(swc_pointer, list):
100105
return self.read_from_paths(swc_pointer)
@@ -278,6 +283,97 @@ def read_from_zipped_file(self, zipfile, path):
278283
filename = os.path.basename(path)
279284
return self.parse(content, filename)
280285

286+
def read_from_gcs(self, gcs_dict):
    """
    Reads SWC files from a GCS bucket location that contains either
    loose SWC files or ZIP archives of SWC files.

    Parameters
    ----------
    gcs_dict : dict
        Dictionary with the keys "bucket_name" and "path" that specify
        where the SWC files (or ZIP archives) are located in a GCS
        bucket.

    Returns
    -------
    deque[dict]
        List of dictionaries whose keys and values are the attribute
        names and values from an SWC file.

    Raises
    ------
    Exception
        If neither SWC files nor ZIP archives are found at the given
        GCS location.

    """
    # List filenames at the given GCS location
    bucket = storage.Client().bucket(gcs_dict["bucket_name"])
    swc_paths = util.list_gcs_filenames(bucket, gcs_dict["path"], ".swc")
    zip_paths = util.list_gcs_filenames(bucket, gcs_dict["path"], ".zip")

    # Dispatch to the appropriate reader (loose SWC files take precedence)
    if len(swc_paths) > 0:
        return self.read_from_gcs_swcs(bucket, swc_paths)
    if len(zip_paths) > 0:
        return self.read_from_gcs_zips(bucket, zip_paths)

    # Error
    raise Exception(f"GCS Pointer is invalid -{gcs_dict}-")
316+
317+
def read_from_gcs_swcs(self, bucket, swc_paths):
    """
    Reads SWC files stored directly (unzipped) in a GCS bucket.

    Parameters
    ----------
    bucket : google.cloud.storage.Bucket
        Bucket that the SWC files are read from.
    swc_paths : List[str]
        Paths to SWC files in "bucket".

    Returns
    -------
    deque[dict]
        List of dictionaries whose keys and values are the attribute
        names and values from an SWC file.

    Raises
    ------
    NotImplementedError
        Always. This reader is not implemented yet; the previous stub
        silently returned None, which callers would mistake for a
        valid (empty) result.

    """
    raise NotImplementedError(
        "Reading loose SWC files from GCS is not supported yet"
    )
319+
320+
def read_from_gcs_zips(self, bucket, zip_paths):
    """
    Reads SWC files from ZIP archives stored in a GCS bucket.

    Parameters
    ----------
    bucket : google.cloud.storage.Bucket
        Bucket that the ZIP archives are downloaded from.
    zip_paths : List[str]
        Paths to ZIP archives in "bucket".

    Returns
    -------
    deque[dict]
        List of dictionaries whose keys and values are the attribute
        names and values from an SWC file.

    """
    pbar = tqdm(total=len(zip_paths), desc="Read SWCs")
    swc_dicts = deque()
    try:
        with ProcessPoolExecutor() as executor:
            # Assign processes. NOTE: archives are downloaded serially
            # here because bucket/blob handles cannot be sent to worker
            # processes; only the raw bytes are shipped to workers.
            processes = list()
            for path in zip_paths:
                zip_content = bucket.blob(path).download_as_bytes()
                processes.append(
                    executor.submit(self.read_from_gcs_zip, zip_content)
                )

            # Store results
            for process in as_completed(processes):
                swc_dicts.extend(process.result())
                pbar.update(1)
    finally:
        # Always release the progress bar, even if a worker raised
        pbar.close()
    return swc_dicts
338+
339+
def read_from_gcs_zip(self, zip_content):
    """
    Reads SWC files stored in a ZIP archive downloaded from a GCS
    bucket.

    Parameters
    ----------
    zip_content : bytes
        Content of a ZIP archive.

    Returns
    -------
    deque[dict]
        List of dictionaries whose keys and values are the attribute
        names and values from an SWC file.

    """
    with ZipFile(BytesIO(zip_content)) as zip_file:
        with ThreadPoolExecutor() as executor:
            # Assign threads. Use the already-open archive's namelist()
            # rather than re-parsing "zip_content" into a second
            # ZipFile just to list its members.
            threads = list()
            for filename in zip_file.namelist():
                if self.confirm_read(filename):
                    threads.append(
                        executor.submit(
                            self.read_from_zipped_file, zip_file, filename
                        )
                    )

            # Process results
            swc_dicts = deque()
            for thread in as_completed(threads):
                result = thread.result()
                if result:
                    swc_dicts.append(result)
    return swc_dicts
376+
281377
def confirm_read(self, filename):
282378
"""
283379
Checks whether the swc_id corresponding to the given filename is

src/segmentation_skeleton_metrics/utils/util.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,84 @@ def update_txt(path, text):
167167
file.write(text + "\n")
168168

169169

170+
# -- GCS utils --
171+
def list_files_in_zip(zip_content):
    """
    Lists all files in a ZIP archive given its raw content.

    Parameters
    ----------
    zip_content : bytes
        Content of a ZIP archive (e.g. downloaded from a GCS bucket).

    Returns
    -------
    List[str]
        List of filenames in the ZIP archive.

    """
    with ZipFile(BytesIO(zip_content), "r") as zip_file:
        return zip_file.namelist()
188+
189+
190+
def list_gcs_filenames(bucket, prefix, extension):
    """
    Lists all files in a GCS bucket with the given extension.

    Parameters
    ----------
    bucket : google.cloud.storage.Bucket
        Bucket to be read from.
    prefix : str
        Path to directory in "bucket".
    extension : str
        File extension of filenames to be listed (e.g. ".swc").

    Returns
    -------
    List[str]
        Paths under "prefix" whose filename ends with "extension".

    """
    blobs = bucket.list_blobs(prefix=prefix)
    # endswith() avoids false positives such as "file.swc.bak", which a
    # substring test ("extension in blob.name") would wrongly include
    return [blob.name for blob in blobs if blob.name.endswith(extension)]
211+
212+
213+
def list_gcs_subdirectories(bucket_name, prefix):
    """
    Lists all direct subdirectories of a given prefix in a GCS bucket.

    Parameters
    ----------
    bucket_name : str
        Name of bucket to be read from.
    prefix : str
        Path to directory in "bucket_name".

    Returns
    -------
    List[str]
        List of direct subdirectories.

    """
    # Load blobs. The iterator must be fully consumed before
    # "blobs.prefixes" is populated by the GCS client; consuming it
    # with a bare loop avoids building a throwaway list of names.
    storage_client = storage.Client()
    blobs = storage_client.list_blobs(
        bucket_name, prefix=prefix, delimiter="/"
    )
    for _ in blobs:
        pass

    # Parse directory contents. Loop variable renamed so it no longer
    # shadows the "prefix" parameter.
    prefix_depth = len(prefix.split("/"))
    subdirs = list()
    for subprefix in blobs.prefixes:
        is_dir = subprefix.endswith("/")
        is_direct_subdir = len(subprefix.split("/")) - 1 == prefix_depth
        if is_dir and is_direct_subdir:
            subdirs.append(subprefix)
    return subdirs
246+
247+
170248
# --- Miscellaneous ---
171249
def get_segment_id(filename):
172250
"""

0 commit comments

Comments
 (0)