Skip to content

Commit 5103ca7

Browse files
authored
refactor: simplified
1 parent c9e45bd commit 5103ca7

File tree

2 files changed

+157
-34
lines changed

2 files changed

+157
-34
lines changed

src/aind_exaspim_image_compression/utils/img_util.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,24 @@
1010

1111
from bm4d import bm4d
1212
from concurrent.futures import ThreadPoolExecutor
13+
from imagecodecs.numcodecs import Jpegxl
1314
from itertools import product
1415
from numcodecs import Blosc, register_codec
1516
from ome_zarr.writer import write_multiscale
1617
from scipy.ndimage import uniform_filter
18+
from typing import Any
1719
from xarray_multiscale import multiscale, windowed_mode
1820

1921
import gcsfs
2022
import matplotlib.pyplot as plt
2123
import numpy as np
24+
import os
2225
import s3fs
2326
import tifffile
2427
import zarr
2528

29+
from aind_exaspim_image_compression.utils import util
30+
2631

2732
# --- Image Reader ---
2833
def read(img_path):
@@ -68,6 +73,7 @@ def _read_zarr(img_path):
6873
np.ndarray
6974
Loaded image volume.
7075
"""
76+
register_codec(Jpegxl)
7177
if _is_gcs_path(img_path):
7278
fs = gcsfs.GCSFileSystem(anon=False)
7379
store = zarr.storage.FSStore(img_path, fs=fs)
@@ -442,7 +448,7 @@ def compress_patch(idx):
442448

443449

444450
def compress_and_decompress_jpeg(
445-
img, codec, patch_shape=(128, 128, 64), max_workers=32
451+
img, codec, patch_shape=(32, 256, 256), max_workers=32
446452
):
447453
# Helper routine
448454
def process_patch(idx):
@@ -491,12 +497,11 @@ def init_ome_zarr(
491497
compressor=Blosc(cname="zstd", clevel=5, shuffle=Blosc.SHUFFLE),
492498
):
493499
# Setup output store
494-
register_codec(compressor)
495500
store = zarr.DirectoryStore(output_path, dimension_separator="/")
496501
zgroup = zarr.group(store=store)
497502

498503
# Create top-level dataset
499-
print("Creating ome-zarr image with shape:", img.shape)
504+
print("Creating OMEZARR Image with Shape:", img.shape)
500505
output_zarr = zgroup.create_dataset(
501506
name=0,
502507
shape=img.shape,
@@ -526,7 +531,6 @@ def write_ome_zarr(
526531
pyramid = [level.data for level in pyramid]
527532

528533
# Prepare Zarr store
529-
register_codec(compressor)
530534
store = zarr.DirectoryStore(output_path, dimension_separator="/")
531535
zgroup = zarr.open(store=store, mode="w")
532536

src/aind_exaspim_image_compression/utils/util.py

Lines changed: 149 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -253,59 +253,52 @@ def copy_gcs_directory(bucket_name, source_prefix, destination_prefix):
253253
bucket.copy_blob(blob, bucket, new_blob_name)
254254

255255

256-
def get_gcs_directory_size(bucket_name, prefix):
256+
def find_subprefix_with_keyword(bucket_name, prefix, keyword):
257257
"""
258-
Calculates the total size of all objects under a given prefix in a GCS
259-
bucket.
258+
Finds the first GCS subprefix under a given prefix that contains a
259+
specified keyword.
260260
261261
Parameters
262262
----------
263263
bucket_name : str
264264
Name of the GCS bucket.
265265
prefix : str
266-
Path prefix within the bucket.
266+
The prefix to search under.
267+
keyword : str
268+
Keyword to look for within the subprefixes.
267269
268270
Returns
269271
-------
270-
float
271-
Total size in gigabytes (GB) of all objects under the given prefix.
272+
str
273+
First subprefix containing the keyword.
272274
"""
273-
# Download blobs
274-
client = storage.Client()
275-
bucket = client.bucket(bucket_name)
276-
blobs = client.list_blobs(bucket, prefix=prefix)
277-
278-
# Compute size of blobs
279-
total_size = 0
280-
for blob in blobs:
281-
total_size += blob.size
282-
283-
return total_size / 1024 ** 3
275+
for subprefix in list_gcs_subprefixes(bucket_name, prefix):
276+
if keyword in subprefix:
277+
return subprefix
278+
raise Exception(f"Prefix with keyword '{keyword}' not found in {prefix}")
284279

285280

286-
def find_subprefix_with_keyword(bucket_name, prefix, keyword):
281+
def get_gcs_directory_size(bucket_name, prefix):
287282
"""
288-
Finds the first GCS subprefix under a given prefix that contains a
289-
specified keyword.
283+
Calculate the total size of a GCS "directory" (i.e., objects under a prefix),
284+
and return it in gigabytes (GB).
290285
291286
Parameters
292287
----------
293288
bucket_name : str
294289
Name of the GCS bucket.
295290
prefix : str
296-
The prefix to search under.
297-
keyword : str
298-
Keyword to look for within the subprefixes.
291+
GCS path prefix (e.g., 'my_folder/' to list everything under that directory).
299292
300293
Returns
301294
-------
302-
str
303-
First subprefix containing the keyword.
295+
float
296+
Total size in gigabytes.
304297
"""
305-
for subprefix in list_gcs_subprefixes(bucket_name, prefix):
306-
if keyword in subprefix:
307-
return subprefix
308-
raise Exception(f"Prefix with keyword '{keyword}' not found in {prefix}")
298+
client = storage.Client()
299+
bucket = client.bucket(bucket_name)
300+
blobs = bucket.list_blobs(prefix=prefix)
301+
return sum(blob.size for blob in blobs) / (1024 ** 3)
309302

310303

311304
def list_block_paths(brain_id):
@@ -427,9 +420,135 @@ def upload_directory_to_gcs(bucket_name, source_dir, destination_dir):
427420

428421

429422
# --- S3 utils ---
423+
def exists_in_prefix(bucket_name, prefix, name):
424+
"""
425+
Checks if a given filename is in a prefix.
426+
427+
Parameters
428+
----------
429+
bucket_name : str
430+
Name of the S3 bucket to search.
431+
prefix : str
432+
S3 prefix to search within.
433+
name : str
434+
Filename to search for.
435+
436+
Returns
437+
-------
438+
bool
439+
Indication of whether a given file is in a prefix.
440+
"""
441+
prefixes = list_s3_prefixes(bucket_name, prefix)
442+
return sum([1 for prefix in prefixes if name in prefix]) > 0
443+
444+
445+
def list_s3_prefixes(bucket_name, prefix):
446+
"""
447+
Lists all immediate subdirectories of a given S3 path (prefix).
448+
449+
Parameters
450+
-----------
451+
bucket_name : str
452+
Name of the S3 bucket to search.
453+
prefix : str
454+
S3 prefix to search within.
455+
456+
Returns
457+
-------
458+
List[str]
459+
List of immediate subdirectories under the specified prefix.
460+
"""
461+
# Check prefix is valid
462+
if not prefix.endswith("/"):
463+
prefix += "/"
464+
465+
# Call the list_objects_v2 API
466+
s3 = boto3.client("s3")
467+
response = s3.list_objects_v2(
468+
Bucket=bucket_name, Prefix=prefix, Delimiter="/"
469+
)
470+
if "CommonPrefixes" in response:
471+
return [cp["Prefix"] for cp in response["CommonPrefixes"]]
472+
else:
473+
return list()
474+
475+
476+
def list_s3_bucket_prefixes(bucket_name, keyword=None):
477+
"""
478+
Lists all top-level prefixes (directories) in an S3 bucket, optionally
479+
filtering by a keyword.
480+
481+
Parameters
482+
-----------
483+
bucket_name : str
484+
Name of the S3 bucket to search.
485+
keyword : str, optional
486+
Keyword used to filter the prefixes. Default is None.
487+
488+
Returns
489+
--------
490+
prefixes : List[str]
491+
A list of top-level prefixes (directories) in the S3 bucket. If a
492+
keyword is provided, only the matching prefixes are returned.
493+
"""
494+
# Initializations
495+
prefixes = list()
496+
continuation_token = None
497+
s3 = boto3.client("s3")
498+
499+
# Main
500+
keyword = keyword.lower()
501+
while True:
502+
# Call the list_objects_v2 API
503+
list_kwargs = {"Bucket": bucket_name, "Delimiter": "/"}
504+
if continuation_token:
505+
list_kwargs["ContinuationToken"] = continuation_token
506+
response = s3.list_objects_v2(**list_kwargs)
507+
508+
# Collect the top-level prefixes
509+
if "CommonPrefixes" in response:
510+
for prefix in response["CommonPrefixes"]:
511+
if keyword and keyword in prefix["Prefix"].lower():
512+
prefixes.append(prefix["Prefix"])
513+
elif keyword is None:
514+
prefixes.append(prefix["Prefix"])
515+
516+
# Check if there are more pages to fetch
517+
if response.get("IsTruncated"):
518+
continuation_token = response.get("NextContinuationToken")
519+
else:
520+
break
521+
return prefixes
522+
523+
524+
def is_file_in_prefix(bucket_name, prefix, filename):
525+
"""
526+
Checks if a specific file exists within a given S3 prefix.
527+
528+
Parameters
529+
----------
530+
bucket_name : str
531+
Name of the S3 bucket to be searched.
532+
prefix : str
533+
S3 prefix (path) under which to look for the file.
534+
filename : str
535+
Name of the file to search for within the specified prefix.
536+
537+
Returns
538+
-------
539+
bool
540+
Returns "True" if the file exists within the given prefix,
541+
otherwise "False".
542+
"""
543+
for sub_prefix in list_s3_prefixes(bucket_name, prefix):
544+
if filename in sub_prefix:
545+
return True
546+
return False
547+
548+
430549
def write_to_s3(local_path, bucket_name, prefix):
431550
"""
432-
Writes a single file on local machine to an S3 bucket.
551+
Writes a single file on local machine to an S3 bucket.
433552
434553
Parameters
435554
----------

0 commit comments

Comments
 (0)