Draft

44 commits
30c4e10
Remove V2 tests
aliddell Oct 8, 2025
b2d95a7
Remove ZarrVersion and references to V2 in tests/benchmarks/examples
aliddell Oct 9, 2025
cda2d13
Remove V2Array and V2MultiscaleArray
aliddell Oct 9, 2025
217ce98
V3Array -> Array
aliddell Oct 9, 2025
d6159f3
V3MultiscaleArray -> MultiscaleArray
aliddell Oct 9, 2025
9481f08
Update changelog
aliddell Oct 9, 2025
ae6d911
wip
aliddell Oct 12, 2025
4d462a2
Reinstate ZarrVersion, but remove V2
aliddell Oct 14, 2025
94798e2
Check only V3 is passed into constructor/setter
aliddell Oct 14, 2025
e26da44
Test that setting version to 2 raises a RuntimeError
aliddell Oct 14, 2025
e74d079
Test that setting version to anything but 3 fails to validate (in C++)
aliddell Oct 14, 2025
b4fc900
Default to `ZarrVersion_3` when `ZarrStreamSettings.version == 0`
aliddell Oct 14, 2025
e2caa4b
Merge branch 'remove-zarr-v2' into kill-sink
aliddell Oct 14, 2025
67d4fef
Merge branch 'main' into kill-sink
aliddell Oct 14, 2025
077f0ec
Break out compression and updating chunk table into their own methods
aliddell Oct 14, 2025
7928c07
Precompute frames before flush
aliddell Oct 14, 2025
5b5749c
(wip): builds but fails
aliddell Oct 16, 2025
096c463
Merge remote-tracking branch 'upstream/main' into kill-sink
aliddell Oct 16, 2025
064b450
(wip): tests ok (slow)
aliddell Oct 16, 2025
5531bd2
#include <cstring> where appropriate
aliddell Oct 16, 2025
2f92a0c
kill -9 Sink
aliddell Oct 16, 2025
97fa8b4
Remove LockedBuffer
aliddell Oct 16, 2025
2c89501
Add overwrite flag to Python benchmark
aliddell Oct 16, 2025
3dee8c6
Be consistent in vector/span defs
aliddell Oct 16, 2025
fd7c9a8
Set env a different way in S3 Python tests
aliddell Oct 16, 2025
e4a07fc
Merge branch 'main' into kill-sink
aliddell Oct 16, 2025
840283d
Merge remote-tracking branch 'upstream/main' into kill-sink
aliddell Oct 17, 2025
839a0f9
(wip) yet another Array refactor
aliddell Oct 17, 2025
812dc7d
Merge remote-tracking branch 'upstream/main' into kill-sink
aliddell Oct 17, 2025
ae18135
(wip) reinstate try-catch in array-write-even.cpp
aliddell Oct 17, 2025
e33e60d
(wip) don't block FSArray::write until closing the shard (1/n)
aliddell Oct 21, 2025
e6df041
(wip)
aliddell Oct 23, 2025
51832c3
(wip) again
aliddell Oct 23, 2025
2e89474
(wip)
aliddell Oct 24, 2025
9990b6a
(wip) remove unused FSStorage::write_binary
aliddell Oct 27, 2025
21803ae
Log a more useful error message in zarr-stream-partial-append.cpp
aliddell Oct 27, 2025
2239435
Write FSArray shard tables in one go (don't miss unwritten chunks)
aliddell Oct 27, 2025
fdd348e
Fix S3Array table flushing
aliddell Oct 29, 2025
f904073
Move `ShardLayer` to `S3Array`
aliddell Oct 30, 2025
0f89a9c
Rename `Array::finalize_io_streams_` to `Array::finalize_append_shard_`
aliddell Oct 30, 2025
15efd33
(wip): allow thread creation from other threads
aliddell Oct 30, 2025
aac1154
(wip): collect shard file data in structs, wait to close shard files …
aliddell Oct 30, 2025
60f2b5b
(wip): update stream-raw-to-filesystem to match benchmark.py config
aliddell Oct 30, 2025
c818bd0
(wip): sprinkle some more omp fairydust
aliddell Oct 30, 2025
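Several of the commits above (reinstating `ZarrVersion` without V2, checking that only V3 is passed into the constructor/setter, and testing that requesting version 2 raises a `RuntimeError`) describe the intended version-handling behavior. Below is a minimal sketch of that behavior from the Python side; the class and enum names (`StreamSettings`, `ZarrVersion.V3`) are assumptions inferred from the commit messages and the API visible in the diffs that follow.

```python
# Hedged sketch only: names are assumptions, not the library's confirmed API.
import acquire_zarr as aqz
import pytest

settings = aqz.StreamSettings()
settings.version = aqz.ZarrVersion.V3  # the only value expected to validate

with pytest.raises(RuntimeError):
    settings.version = 2  # V2 support has been removed; the setter rejects it
```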
7 changes: 6 additions & 1 deletion .github/workflows/test.yml
@@ -165,7 +165,12 @@ jobs:
run: python -m pip install ".[testing]"

- name: Test Python
run: python -m pytest -v -k test_stream_data_to_s3
env:
ZARR_S3_ENDPOINT: ${{ env.MINIO_URL }}
ZARR_S3_BUCKET_NAME: ${{ env.MINIO_BUCKET }}
AWS_ACCESS_KEY_ID: ${{ env.MINIO_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ env.MINIO_SECRET_KEY }}
run: python -m pytest -s -k test_stream_data_to_s3


test-python:
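The workflow change above exports the MinIO endpoint, bucket, and credentials as environment variables for the pytest invocation. A minimal sketch of how a test might assemble its S3 configuration from those variables is shown below; the variable names come from the workflow, while the helper function and its return shape are illustrative assumptions rather than the suite's actual fixture code.

```python
import os

def s3_settings_from_env():
    """Read the S3 configuration exported by the CI step above (sketch only)."""
    endpoint = os.environ.get("ZARR_S3_ENDPOINT")
    bucket = os.environ.get("ZARR_S3_BUCKET_NAME")
    if not endpoint or not bucket:
        return None  # caller can pytest.skip("S3 settings not set")
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are picked up from the
    # environment by the S3 client itself, so they are not returned here.
    return {"endpoint": endpoint, "bucket_name": bucket}
```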
13 changes: 11 additions & 2 deletions benchmarks/benchmark.py
@@ -148,6 +148,7 @@ def run_acquire_zarr_test(
data_type=aqz.DataType.UINT16,
)
],
overwrite=True,
)

# Create a ZarrStream for appending frames.
@@ -156,15 +157,23 @@
elapsed_times = []

total_start = time.perf_counter_ns()
chunk = np.empty((tchunk_size, 2048, 2048), dtype=np.uint16)
for i in range(data.shape[0]):
start_plane = time.perf_counter_ns()
stream.append(data[i])
chunk_idx = i % tchunk_size
chunk[chunk_idx] = data[i]
if chunk_idx == tchunk_size - 1:
stream.append(chunk)
elapsed = time.perf_counter_ns() - start_plane
elapsed_times.append(elapsed)
print(f"Acquire-zarr: Plane {i} written in {elapsed / 1e6:.3f} ms")

# Close (or flush) the stream to finalize writes.
del stream
start_close = time.perf_counter_ns()
stream.close()
elapsed = time.perf_counter_ns() - start_close
elapsed_times.append(elapsed)
print(f"Acquire-zarr: Final close took {elapsed / 1e6:.3f} ms")
total_elapsed = time.perf_counter_ns() - total_start
tot_ms = total_elapsed / 1e6
print(f"Acquire-zarr: Total write time: {tot_ms:.3f} ms")
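The benchmark change above stops appending one plane per call: it accumulates `tchunk_size` planes into a preallocated buffer, appends a full time-chunk at once, and times an explicit `stream.close()` instead of relying on `del stream`. A standalone sketch of that pattern follows; the trailing partial-chunk branch is an added assumption for generality, since the benchmark's frame count presumably divides evenly by `tchunk_size`.

```python
import numpy as np

def append_in_chunks(stream, data, tchunk_size):
    """Append a (t, y, x) array to an acquire-zarr stream one time-chunk at a time."""
    chunk = np.empty((tchunk_size, *data.shape[1:]), dtype=data.dtype)
    for i in range(data.shape[0]):
        chunk_idx = i % tchunk_size
        chunk[chunk_idx] = data[i]
        if chunk_idx == tchunk_size - 1:
            stream.append(chunk)  # hand the writer a full chunk in one call
    remainder = data.shape[0] % tchunk_size
    if remainder:
        stream.append(chunk[:remainder])  # flush any trailing partial chunk
    stream.close()  # finalize writes; remaining shards are flushed here
```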
4 changes: 2 additions & 2 deletions python/acquire-zarr-py.cpp
@@ -888,7 +888,7 @@ class PyZarrStreamSettings
settings_.overwrite = static_cast<int>(overwrite_);

if (s3_settings_) {
*(settings_.s3_settings) = *(s3_settings_->settings());
settings_.s3_settings = s3_settings_->settings();
}

// construct array lifetime props and set up arrays
Expand Down Expand Up @@ -1087,7 +1087,7 @@ class PyZarrStream
}

auto buf = contiguous_data.request();
auto* ptr = (uint8_t*)buf.ptr;
auto* ptr = static_cast<uint8_t*>(buf.ptr);

py::gil_scoped_release release;

50 changes: 27 additions & 23 deletions python/tests/test_stream.py
@@ -378,7 +378,6 @@ def test_stream_data_to_filesystem(
data[i, :, :] = i

stream.append(data)

stream.close() # close the stream, flush the files

chunk_size_bytes = data.dtype.itemsize
@@ -394,40 +393,40 @@
shard_size_bytes + table_size_bytes + 4
) # 4 bytes for crc32c checksum

for x in range(settings.arrays[0].dimensions[-1].array_size_px):
for y in range(settings.arrays[0].dimensions[-2].array_size_px):
for z in range(settings.arrays[0].dimensions[-3].array_size_px):
shard_file = store_path / "test.zarr" / "0" / "c" / str(z) / str(y) / str(x)
assert shard_file.is_file()
if compression_codec is None:
assert shard_file.stat().st_size == shard_size_bytes
else:
size = shard_file.stat().st_size
assert table_size_bytes < size <= shard_size_bytes

group = zarr.open(settings.store_path, mode="r")
array = group["0"]

assert array.shape == data.shape
for i in range(array.shape[0]):
assert np.array_equal(array[i, :, :], data[i, :, :])
assert np.array_equal(array[i, :, :], data[i, :, :]), f"Data mismatch at index {i}"

metadata = array.metadata
sharding_codec = metadata.codecs[0]
if compression_codec is not None:
cname = (
zblosc.BloscCname.lz4
if compression_codec == CompressionCodec.BLOSC_LZ4
else zblosc.BloscCname.zstd
)
blosc_codec = metadata.codecs[0].codecs[1]

assert len(sharding_codec.codecs) == 2
blosc_codec = sharding_codec.codecs[1]
assert blosc_codec.cname == cname
assert blosc_codec.clevel == 1
assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle

assert (
store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0"
).is_file()
assert (
store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0"
).stat().st_size <= shard_size_bytes
else:
assert len(metadata.codecs[0].codecs) == 1

assert (
store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0"
).is_file()
assert (
store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0"
).stat().st_size == shard_size_bytes
assert len(sharding_codec.codecs) == 1 # bytes codec


@pytest.mark.parametrize(
@@ -456,12 +455,12 @@ def test_stream_data_to_s3(
pytest.skip("S3 settings not set")

settings.store_path = f"{request.node.name}.zarr".replace("[", "").replace(
"]", ""
"]", "_"
)
settings.s3 = s3_settings
settings.data_type = np.uint16
settings.arrays[0].data_type = np.uint16
if compression_codec is not None:
settings.compression = CompressionSettings(
settings.arrays[0].compression = CompressionSettings(
compressor=Compressor.BLOSC1,
codec=compression_codec,
level=1,
@@ -501,18 +500,23 @@
assert np.array_equal(array[i, :, :], data[i, :, :])

metadata = array.metadata
assert len(metadata.codecs) == 1 # sharding codec
sharding_codec = metadata.codecs[0]

if compression_codec is not None:
cname = (
zblosc.BloscCname.lz4
if compression_codec == CompressionCodec.BLOSC_LZ4
else zblosc.BloscCname.zstd
)
blosc_codec = metadata.codecs[0].codecs[1]
assert len(sharding_codec.codecs) == 2

blosc_codec = sharding_codec.codecs[1]
assert blosc_codec.cname == cname
assert blosc_codec.clevel == 1
assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle
else:
assert len(metadata.codecs[0].codecs) == 1
assert len(sharding_codec.codecs) == 1 # bytes codec

# cleanup
s3 = s3fs.S3FileSystem(
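The `test_stream_data_to_filesystem` changes above assert on shard file sizes derived from the chunk geometry: under Zarr v3 sharding, each shard file holds its chunk data plus an index of 16 bytes per chunk (offset and byte count, 8 bytes each) and a 4-byte crc32c checksum of that index, matching the `# 4 bytes for crc32c checksum` comment in the test. A worked sketch of the arithmetic is given below; the dimension sizes are illustrative, not the test's actual settings.

```python
import numpy as np

# Illustrative geometry: a shard containing 4 chunks of 32 x 2048 x 2048 uint16.
chunk_shape = (32, 2048, 2048)
chunks_per_shard = 4
itemsize = np.dtype(np.uint16).itemsize

chunk_size_bytes = int(np.prod(chunk_shape)) * itemsize
shard_size_bytes = chunks_per_shard * chunk_size_bytes
table_size_bytes = chunks_per_shard * 2 * 8  # (offset, nbytes) per chunk, uint64 each

# Expected size of an uncompressed shard file: data + index table + crc32c.
expected_file_size = shard_size_bytes + table_size_bytes + 4
```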
22 changes: 14 additions & 8 deletions src/streaming/CMakeLists.txt
@@ -11,8 +11,6 @@ add_library(${tgt}
acquire.zarr.cpp
array.dimensions.hh
array.dimensions.cpp
locked.buffer.hh
locked.buffer.cpp
frame.queue.hh
frame.queue.cpp
downsampler.hh
@@ -29,19 +27,27 @@
s3.connection.cpp
file.handle.hh
file.handle.cpp
sink.hh
sink.cpp
file.sink.hh
file.sink.cpp
${PLATFORM_CPP}
s3.sink.hh
s3.sink.cpp
s3.object.hh
s3.object.cpp
array.base.hh
array.base.cpp
array.hh
array.cpp
multiscale.array.hh
multiscale.array.cpp
fs.storage.hh
fs.storage.cpp
fs.array.hh
fs.array.cpp
fs.multiscale.array.hh
fs.multiscale.array.cpp
s3.storage.hh
s3.storage.cpp
s3.array.hh
s3.array.cpp
s3.multiscale.array.hh
s3.multiscale.array.cpp
plate.hh
plate.cpp
$<TARGET_OBJECTS:acquire-logger-obj>
1 change: 1 addition & 0 deletions src/streaming/acquire.zarr.cpp
@@ -4,6 +4,7 @@
#include "zarr.stream.hh"

#include <bit> // bit_ceil
#include <cstring> // memcpy
#include <cstdint> // uint32_t
#include <unordered_set>
#include <vector>
96 changes: 1 addition & 95 deletions src/streaming/array.base.cpp
@@ -6,18 +6,12 @@
#include "multiscale.array.hh"

zarr::ArrayBase::ArrayBase(std::shared_ptr<ArrayConfig> config,
std::shared_ptr<ThreadPool> thread_pool,
std::shared_ptr<FileHandlePool> file_handle_pool,
std::shared_ptr<S3ConnectionPool> s3_connection_pool)
std::shared_ptr<ThreadPool> thread_pool)
: config_(config)
, thread_pool_(thread_pool)
, s3_connection_pool_(s3_connection_pool)
, file_handle_pool_(file_handle_pool)
{
CHECK(config_); // required
CHECK(thread_pool_); // required
EXPECT(s3_connection_pool_ != nullptr || file_handle_pool_ != nullptr,
"Either S3 connection pool or file handle pool must be provided.");
}

std::string
@@ -31,94 +25,6 @@ zarr::ArrayBase::node_path_() const
return key;
}

bool
zarr::ArrayBase::make_metadata_sinks_()
{
metadata_sinks_.clear();

try {
const auto sink_keys = metadata_keys_();
for (const auto& key : sink_keys) {
const std::string path = node_path_() + "/" + key;
std::unique_ptr<Sink> sink =
config_->bucket_name
? make_s3_sink(*config_->bucket_name, path, s3_connection_pool_)
: make_file_sink(path, file_handle_pool_);

if (sink == nullptr) {
LOG_ERROR("Failed to create metadata sink for ", key);
return false;
}
metadata_sinks_.emplace(key, std::move(sink));
}
} catch (const std::exception& exc) {
LOG_ERROR("Failed to create metadata sinks: ", exc.what());
return false;
}

return true;
}

bool
zarr::ArrayBase::write_metadata_()
{
if (!make_metadata_()) {
LOG_ERROR("Failed to make metadata.");
return false;
}

if (!make_metadata_sinks_()) {
LOG_ERROR("Failed to make metadata sinks.");
return false;
}

for (const auto& [key, metadata] : metadata_strings_) {
const auto it = metadata_sinks_.find(key);
if (it == metadata_sinks_.end()) {
LOG_ERROR("Metadata sink not found for key: ", key);
return false;
}

auto& sink = it->second;
if (!sink) {
LOG_ERROR("Metadata sink is null for key: ", key);
return false;
}

std::span data{ reinterpret_cast<const uint8_t*>(metadata.data()),
metadata.size() };
if (!sink->write(0, data)) {
LOG_ERROR("Failed to write metadata for key: ", key);
return false;
}
}

return true;
}

std::unique_ptr<zarr::ArrayBase>
zarr::make_array(std::shared_ptr<ArrayConfig> config,
std::shared_ptr<ThreadPool> thread_pool,
std::shared_ptr<FileHandlePool> file_handle_pool,
std::shared_ptr<S3ConnectionPool> s3_connection_pool)
{
// create a multiscale array at the dataset root (node_key is empty) or if
// we have a genuine multiscale dataset
const auto multiscale =
config->node_key.empty() || config->downsampling_method.has_value();

std::unique_ptr<ArrayBase> array;
if (multiscale) {
array = std::make_unique<MultiscaleArray>(
config, thread_pool, file_handle_pool, s3_connection_pool);
} else {
array = std::make_unique<Array>(
config, thread_pool, file_handle_pool, s3_connection_pool);
}

return array;
}

bool
zarr::finalize_array(std::unique_ptr<ArrayBase>&& array)
{