1
- import re
1
+ import asyncio
2
2
import time
3
- from urllib . parse import urlparse
3
+ from pathlib import Path
4
4
5
- import fsspec
5
+ import obstore as obs
6
6
7
- from cubed .utils import join_path
7
+
8
def path_to_store(path):
    """Build an obstore store for *path*.

    Parameters
    ----------
    path : str or pathlib.Path
        Either a URL (anything containing ``"://"``), a plain local
        filesystem path string, or a ``Path`` object. Local paths are
        converted to ``file://`` URIs and the directory is created
        (``mkdir=True``) so the store is immediately writable.

    Returns
    -------
    An ``obstore`` store object suitable for ``obs.get`` / ``obs.put``.

    Raises
    ------
    TypeError
        If *path* is neither ``str`` nor ``Path``. (The previous
        implementation silently fell through and returned ``None``,
        which made downstream ``obs.get(None, ...)`` failures hard to
        trace back to the bad argument.)
    """
    if isinstance(path, str):
        if "://" in path:
            # Already a URL (s3://, gs://, file://, ...): pass through as-is.
            return obs.store.from_url(path)
        # Bare local path: normalise to a file:// URI and create the dir.
        return obs.store.from_url(Path(path).as_uri(), mkdir=True)
    elif isinstance(path, Path):
        return obs.store.from_url(path.as_uri(), mkdir=True)
    raise TypeError(f"path must be str or Path, got {type(path).__name__}")
8
16
9
17
10
- def read_int_from_file (path ):
11
- with fsspec . open ( path ) as f :
12
- return int (f . read ())
18
def read_int_from_file(store, path):
    """Fetch the object at *path* from *store* and parse it as an int.

    Raises ``FileNotFoundError`` (via obstore) when the key is absent.
    """
    response = obs.get(store, path)
    raw = response.bytes()
    return int(raw)
13
21
14
22
15
- def write_int_to_file (path , i ):
16
- with fsspec .open (path , "w" ) as f :
17
- f .write (str (i ))
23
def write_int_to_file(store, path, i):
    """Store the decimal string form of integer *i* at *path* in *store*."""
    payload = bytes(str(i), encoding="UTF8")
    obs.put(store, path, payload)
18
25
19
26
20
27
def deterministic_failure (path , timing_map , i , * , default_sleep = 0.01 , name = None ):
@@ -34,13 +41,12 @@ def deterministic_failure(path, timing_map, i, *, default_sleep=0.01, name=None)
34
41
they will all run normally.
35
42
"""
36
43
# increment number of invocations of this function with arg i
37
- invocation_count_file = join_path (path , f"{ i } " )
38
- fs = fsspec .open (invocation_count_file ).fs
39
- if fs .exists (invocation_count_file ):
40
- invocation_count = read_int_from_file (invocation_count_file )
41
- else :
44
+ store = path_to_store (path )
45
+ try :
46
+ invocation_count = read_int_from_file (store , f"{ i } " )
47
+ except FileNotFoundError :
42
48
invocation_count = 0
43
- write_int_to_file (invocation_count_file , invocation_count + 1 )
49
+ write_int_to_file (store , f" { i } " , invocation_count + 1 )
44
50
45
51
timing_code = default_sleep
46
52
if i in timing_map :
@@ -62,6 +68,20 @@ def deterministic_failure(path, timing_map, i, *, default_sleep=0.01, name=None)
62
68
63
69
def check_invocation_counts(
    path, timing_map, n_tasks, retries=None, expected_invocation_counts_overrides=None
):
    """Synchronous entry point: run the async invocation-count check to completion.

    Builds the coroutine and drives it with ``asyncio.run``; see
    ``check_invocation_counts_async`` for the parameter semantics.
    """
    coro = check_invocation_counts_async(
        path,
        timing_map,
        n_tasks,
        retries=retries,
        expected_invocation_counts_overrides=expected_invocation_counts_overrides,
    )
    asyncio.run(coro)
81
+
82
+
83
+ async def check_invocation_counts_async (
84
+ path , timing_map , n_tasks , retries = None , expected_invocation_counts_overrides = None
65
85
):
66
86
expected_invocation_counts = {}
67
87
for i in range (n_tasks ):
@@ -84,16 +104,11 @@ def check_invocation_counts(
84
104
expected_invocation_counts .update (expected_invocation_counts_overrides )
85
105
86
106
# retrieve outputs concurrently, so we can test on large numbers of inputs
87
- # see https://filesystem-spec.readthedocs.io/en/latest/async.html#synchronous-api
88
- if re .match (r"^[a-zA-Z]:\\" , str (path )): # Windows local file
89
- protocol = ""
90
- else :
91
- protocol = urlparse (str (path )).scheme
92
- fs = fsspec .filesystem (protocol )
93
- paths = [join_path (path , str (i )) for i in range (n_tasks )]
94
- out = fs .cat (paths )
95
- path_to_i = lambda p : int (p .rsplit ("/" , 1 )[- 1 ])
96
- actual_invocation_counts = {path_to_i (path ): int (val ) for path , val in out .items ()}
107
+ store = path_to_store (path )
108
+ paths = [str (i ) for i in range (n_tasks )]
109
+ results = await asyncio .gather (* [obs .get_async (store , path ) for path in paths ])
110
+ values = await asyncio .gather (* [result .bytes_async () for result in results ])
111
+ actual_invocation_counts = {i : int (val ) for i , val in enumerate (values )}
97
112
98
113
if actual_invocation_counts != expected_invocation_counts :
99
114
for i , expected_count in expected_invocation_counts .items ():
0 commit comments