
Commit 6bf43e3

Improved S3 functions, fixed issues
1 parent 4394134 commit 6bf43e3

File tree

6 files changed: +66 -66 lines changed

flamingo_tools/s3_utils.py

Lines changed: 37 additions & 33 deletions
@@ -3,49 +3,74 @@
 import s3fs
 import zarr
 
-from tqdm import tqdm
+"""
+This script contains utility functions for processing data located on an S3 storage system.
+The upload of data to the storage system should be performed with 'rclone'.
+"""
 
-# Using incucyte s3 as a temporary measure.
-MOBIE_FOLDER = "/mnt/lustre-emmy-hdd/projects/nim00007/data/moser/lightsheet/mobie"
+# Dedicated bucket for the cochlea lightsheet project.
+MOBIE_FOLDER = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/mobie_project/cochlea-lightsheet"
 SERVICE_ENDPOINT = "https://s3.gwdg.de/"
-BUCKET_NAME = "incucyte-general/lightsheet"
+BUCKET_NAME = "cochlea-lightsheet"
+
+DEFAULT_CREDENTIALS = os.path.expanduser("~/.aws/credentials")
 
 # For MoBIE:
 # https://s3.gwdg.de/incucyte-general/lightsheet
 
-def check_s3_credentials(bucket_name, service_endpoint, credentials):
+def check_s3_credentials(bucket_name, service_endpoint, credential_file):
     """
     Check if S3 parameters and credentials were set either as a function input or were exported as environment variables.
     """
     if bucket_name is None:
         bucket_name = os.getenv('BUCKET_NAME')
     if bucket_name is None:
-        raise ValueError("Provide a bucket name for accessing S3 data.\nEither by using an optional argument or exporting an environment variable:\n--s3_bucket_name <bucket_name>\nexport BUCKET_NAME=<bucket_name>")
+        if "BUCKET_NAME" in globals():
+            bucket_name = BUCKET_NAME
+        else:
+            raise ValueError("Provide a bucket name for accessing S3 data.\nEither by using an optional argument or exporting an environment variable:\n--s3_bucket_name <bucket_name>\nexport BUCKET_NAME=<bucket_name>")
 
     if service_endpoint is None:
         service_endpoint = os.getenv('SERVICE_ENDPOINT')
     if service_endpoint is None:
-        raise ValueError("Provide a service endpoint for accessing S3 data.\nEither by using an optional argument or exporting an environment variable:\n--s3_service_endpoint <endpoint>\nexport SERVICE_ENDPOINT=<endpoint>")
+        if "SERVICE_ENDPOINT" in globals():
+            service_endpoint = SERVICE_ENDPOINT
+        else:
+            raise ValueError("Provide a service endpoint for accessing S3 data.\nEither by using an optional argument or exporting an environment variable:\n--s3_service_endpoint <endpoint>\nexport SERVICE_ENDPOINT=<endpoint>")
 
-    if credentials is None:
+    if credential_file is None:
         access_key = os.getenv('AWS_ACCESS_KEY_ID')
         secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
+
+        # Check for default credentials if no credential_file is provided.
         if access_key is None:
-            raise ValueError("Either provide a credential file as an optional argument or export an access key as an environment variable:\nexport AWS_ACCESS_KEY_ID=<access_key>")
+            if os.path.isfile(DEFAULT_CREDENTIALS):
+                access_key, _ = read_s3_credentials(credential_file=DEFAULT_CREDENTIALS)
+            else:
+                raise ValueError(f"Either provide a credential file as an optional argument, have credentials at '{DEFAULT_CREDENTIALS}', or export an access key as an environment variable:\nexport AWS_ACCESS_KEY_ID=<access_key>")
         if secret_key is None:
-            raise ValueError("Either provide a credential file as an optional argument or export a secret access key as an environment variable:\nexport AWS_SECRET_ACCESS_KEY=<secret_key>")
+            # Check for default credentials.
+            if os.path.isfile(DEFAULT_CREDENTIALS):
+                _, secret_key = read_s3_credentials(credential_file=DEFAULT_CREDENTIALS)
+            else:
+                raise ValueError(f"Either provide a credential file as an optional argument, have credentials at '{DEFAULT_CREDENTIALS}', or export a secret access key as an environment variable:\nexport AWS_SECRET_ACCESS_KEY=<secret_key>")
 
-    return bucket_name, service_endpoint, credentials
+    else:
+        # Check the validity of the credential file.
+        _, _ = read_s3_credentials(credential_file=credential_file)
 
+    return bucket_name, service_endpoint, credential_file
 
 def get_s3_path(
     input_path,
-    bucket_name, service_endpoint,
+    bucket_name=None, service_endpoint=None,
     credential_file=None,
 ):
     """
     Get S3 path for a file or folder and file system based on S3 parameters and credentials.
     """
+    bucket_name, service_endpoint, credential_file = check_s3_credentials(bucket_name, service_endpoint, credential_file)
+
     fs = create_s3_target(url=service_endpoint, anon=False, credential_file=credential_file)
 
     zarr_path = f"{bucket_name}/{input_path}"
@@ -84,24 +109,3 @@ def create_s3_target(url, anon=False, credential_file=None):
     else:
         fs = s3fs.S3FileSystem(anon=anon, client_kwargs=client_kwargs)
     return fs
-
-
-def upload_data():
-    target = create_s3_target(
-        SERVICE_ENDPOINT,
-        credential_file="./credentials.incucyte"
-    )
-    to_upload = []
-    for root, dirs, files in os.walk(MOBIE_FOLDER):
-        dirs.sort()
-        for ff in files:
-            if ff.endswith(".xml"):
-                to_upload.append(os.path.join(root, ff))
-
-    print("Uploading", len(to_upload), "files to")
-
-    for path in tqdm(to_upload):
-        rel_path = os.path.relpath(path, MOBIE_FOLDER)
-        target.put(
-            path, os.path.join(BUCKET_NAME, rel_path)
-        )

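With these changes, get_s3_path resolves its S3 parameters on its own: explicit arguments win, then the BUCKET_NAME, SERVICE_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY environment variables, then ~/.aws/credentials and the module-level defaults. A minimal usage sketch, assuming the fallback chain above; the object path is a hypothetical example:

    import zarr

    from flamingo_tools import s3_utils

    # No bucket, endpoint, or credential file passed: the new fallback
    # chain in check_s3_credentials fills all three in.
    s3_path, fs = s3_utils.get_s3_path("images/example.ome.zarr")

    with zarr.open(s3_path, mode="r") as f:
        print(list(f.keys()))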
flamingo_tools/segmentation/unet_prediction.py

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import torch
1313
import z5py
1414
import zarr
15+
import tifffile
1516
import json
1617

1718
from elf.wrapper import ThresholdWrapper, SimpleTransformationWrapper
@@ -59,15 +60,18 @@ def prediction_impl(input_path, input_key, output_folder, model_path, scale, blo
5960
image_mask = z5py.File(mask_path, "r")["mask"]
6061

6162
if input_key is None:
62-
input_ = imageio.imread(input_path)
63-
chunks = (64, 64, 64)
64-
elif s3 is not None:
63+
try:
64+
input_ = tifffile.memmap(input_path, mode="r")
65+
except ValueError:
66+
print(f"Could not memmap the data from {input_path}. Fall back to load it into memory.")
67+
input_ = imageio.imread(input_path)
68+
elif isinstance(input_path, str):
69+
input_ = open_file(input_path, "r")[input_key]
70+
else:
6571
with zarr.open(input_path, mode="r") as f:
6672
input_ = f[input_key]
67-
chunks = input_.chunks()
68-
else:
69-
input_ = open_file(input_path, "r")[input_key]
70-
chunks = (64, 64, 64)
73+
74+
chunks = getattr(input_, "chunks", (64,64,64))
7175

7276
if scale is None or scale == 1:
7377
original_shape = None
@@ -157,16 +161,19 @@ def find_mask(input_path, input_key, output_folder, s3=None):
157161
return
158162

159163
if input_key is None:
160-
raw = imageio.imread(input_path)
161-
chunks = (64, 64, 64)
162-
elif s3 is not None:
163-
with zarr.open(input_path, mode="r") as fin:
164-
raw = fin[input_key]
165-
chunks = raw.chunks
166-
else:
164+
try:
165+
raw = tifffile.memmap(input_path, mode="r")
166+
except ValueError:
167+
print(f"Could not memmap the data from {input_path}. Fall back to load it into memory.")
168+
raw = imageio.imread(input_path)
169+
elif isinstance(input_path, str):
167170
fin = open_file(input_path, "r")
168171
raw = fin[input_key]
169-
chunks = (64, 64, 64)
172+
else:
173+
with zarr.open(input_path, mode="r") as fin:
174+
raw = fin[input_key]
175+
176+
chunks = getattr(raw, "chunks", (64,64,64))
170177

171178
block_shape = tuple(2 * ch for ch in chunks)
172179
blocking = nt.blocking([0, 0, 0], raw.shape, block_shape)
@@ -318,9 +325,7 @@ def run_unet_prediction_preprocess_slurm(
318325
and stored in a JSON file within the output folder as mean_std.json.
319326
"""
320327
if s3 is not None:
321-
bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
322-
323-
input_path, fs = s3_utils.get_s3_path(input_path, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
328+
input_path, fs = s3_utils.get_s3_path(input_path, bucket_name=s3_bucket_name, service_endpoint=s3_service_endpoint, credential_file=s3_credentials)
324329

325330
if not os.path.isdir(os.path.join(output_folder, "mask.zarr")):
326331
find_mask(input_path, input_key, output_folder, s3=s3)
@@ -355,9 +360,7 @@ def run_unet_prediction_slurm(
355360
slurm_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
356361

357362
if s3 is not None:
358-
bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
359-
360-
input_path, fs = s3_utils.get_s3_path(input_path, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
363+
input_path, fs = s3_utils.get_s3_path(input_path, bucket_name=s3_bucket_name, service_endpoint=s3_service_endpoint, credential_file=s3_credentials)
361364

362365
if slurm_task_id is not None:
363366
slurm_task_id = int(slurm_task_id)

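The new loading logic is the same in prediction_impl and find_mask: try to memory-map a plain TIFF, fall back to an eager read, and let getattr supply default chunks for array types that have none. A standalone sketch of the pattern; the helper name load_volume, the file path, and the imageio import style are assumptions, not part of the commit:

    import imageio.v3 as imageio
    import tifffile

    def load_volume(input_path):
        """Open a TIFF volume lazily if possible.

        tifffile.memmap raises a ValueError for files that cannot be
        memory-mapped (e.g. compressed TIFFs); only then read eagerly.
        """
        try:
            return tifffile.memmap(input_path, mode="r")
        except ValueError:
            print(f"Could not memmap the data from {input_path}. Falling back to loading it into memory.")
            return imageio.imread(input_path)

    volume = load_volume("/path/to/volume.tif")  # hypothetical path
    # numpy arrays and memmaps have no 'chunks' attribute, while zarr and
    # z5py datasets do, so a single getattr covers every branch.
    chunks = getattr(volume, "chunks", (64, 64, 64))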
scripts/extract_block.py

Lines changed: 1 addition & 3 deletions
@@ -66,9 +66,7 @@ def main(
     roi = tuple(slice(co - rh, co + rh) for co, rh in zip(coords, roi_halo))
 
     if s3:
-        bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
-
-        s3_path, fs = s3_utils.get_s3_path(input_file, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+        s3_path, fs = s3_utils.get_s3_path(input_file, bucket_name=s3_bucket_name, service_endpoint=s3_service_endpoint, credential_file=s3_credentials)
 
         with zarr.open(s3_path, mode="r") as f:
             raw = f[input_key][roi]

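After this cleanup, pulling a block out of a remote volume is a two-statement affair. A sketch of the simplified call site, with hypothetical center coordinates, halo, and dataset key:

    import zarr

    from flamingo_tools import s3_utils

    # Hypothetical ROI: a 100-voxel cube around a center coordinate.
    center, halo = (512, 512, 512), (50, 50, 50)
    roi = tuple(slice(c - h, c + h) for c, h in zip(center, halo))

    s3_path, fs = s3_utils.get_s3_path("example.ome.zarr")  # hypothetical object path
    with zarr.open(s3_path, mode="r") as f:
        raw = f["s0"][roi]  # "s0" is an assumed multiscale level key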
scripts/prediction/count_cells.py

Lines changed: 1 addition & 3 deletions
@@ -28,9 +28,7 @@ def main():
         raise ValueError("Either provide an output_folder containing 'segmentation.zarr' or an S3 input.")
 
     if args.s3_input is not None:
-        bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(args.s3_bucket_name, args.s3_service_endpoint, args.s3_credentials)
-
-        s3_path, fs = s3_utils.get_s3_path(args.s3_input, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+        s3_path, fs = s3_utils.get_s3_path(args.s3_input, bucket_name=args.s3_bucket_name, service_endpoint=args.s3_service_endpoint, credential_file=args.s3_credentials)
         with zarr.open(s3_path, mode="r") as f:
             dataset = f[args.input_key]

scripts/prediction/expand_seg_table.py

Lines changed: 1 addition & 2 deletions
@@ -19,8 +19,7 @@ def main(
     :param str s3_service_endpoint: S3 service endpoint. Optional if SERVICE_ENDPOINT has been exported
     """
     if s3:
-        bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(s3_bucket_name, s3_service_endpoint, s3_credentials)
-        tsv_path, fs = s3_utils.get_s3_path(in_path, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+        tsv_path, fs = s3_utils.get_s3_path(in_path, bucket_name=s3_bucket_name, service_endpoint=s3_service_endpoint, credential_file=s3_credentials)
         with fs.open(tsv_path, 'r') as f:
             tsv_table = pd.read_csv(f, sep="\t")
     else:

scripts/prediction/postprocess_seg.py

Lines changed: 2 additions & 4 deletions
@@ -37,14 +37,12 @@ def main():
     tsv_table = None
 
     if args.s3_input is not None:
-        bucket_name, service_endpoint, credentials = s3_utils.check_s3_credentials(args.s3_bucket_name, args.s3_service_endpoint, args.s3_credentials)
-
-        s3_path, fs = s3_utils.get_s3_path(args.s3_input, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+        s3_path, fs = s3_utils.get_s3_path(args.s3_input, bucket_name=args.s3_bucket_name, service_endpoint=args.s3_service_endpoint, credential_file=args.s3_credentials)
         with zarr.open(s3_path, mode="r") as f:
             segmentation = f[args.input_key]
 
         if args.tsv is not None:
-            tsv_path, fs = s3_utils.get_s3_path(args.tsv, bucket_name=bucket_name, service_endpoint=service_endpoint, credential_file=credentials)
+            tsv_path, fs = s3_utils.get_s3_path(args.tsv, bucket_name=args.s3_bucket_name, service_endpoint=args.s3_service_endpoint, credential_file=args.s3_credentials)
             with fs.open(tsv_path, 'r') as f:
                 tsv_table = pd.read_csv(f, sep="\t")

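The same two-step pattern covers tabular data: get_s3_path returns the remote path together with an s3fs filesystem handle, so tables stream through fs.open instead of being downloaded first. A sketch with a hypothetical table path:

    import pandas as pd

    from flamingo_tools import s3_utils

    tsv_path, fs = s3_utils.get_s3_path("tables/segmentation/default.tsv")  # hypothetical path
    with fs.open(tsv_path, "r") as f:
        tsv_table = pd.read_csv(f, sep="\t")
    print(tsv_table.head())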