From 30c4e10593f1a4f815064cee7df3d5688b150b09 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Wed, 8 Oct 2025 16:13:03 -0400 Subject: [PATCH 01/38] Remove V2 tests --- tests/integration/CMakeLists.txt | 28 +- ...=> stream-2d-multiscale-to-filesystem.cpp} | 0 ...=> stream-3d-multiscale-to-filesystem.cpp} | 0 ...pp => stream-compressed-to-filesystem.cpp} | 0 ...-to-s3.cpp => stream-compressed-to-s3.cpp} | 0 ...pend.cpp => stream-multi-frame-append.cpp} | 0 ... stream-multiple-arrays-to-filesystem.cpp} | 0 ... => stream-multiscale-trivial-3rd-dim.cpp} | 0 ...p => stream-named-array-to-filesystem.cpp} | 0 ...to-s3.cpp => stream-named-array-to-s3.cpp} | 0 ...ystem.cpp => stream-raw-to-filesystem.cpp} | 0 ...-v3-raw-to-s3.cpp => stream-raw-to-s3.cpp} | 0 ...tream-zarr-v2-compressed-to-filesystem.cpp | 374 ------------- .../stream-zarr-v2-compressed-to-s3.cpp | 507 ------------------ ...ream-zarr-v2-named-array-to-filesystem.cpp | 289 ---------- .../stream-zarr-v2-named-array-to-s3.cpp | 404 -------------- .../stream-zarr-v2-raw-to-filesystem.cpp | 355 ------------ .../integration/stream-zarr-v2-raw-to-s3.cpp | 483 ----------------- tests/unit-tests/CMakeLists.txt | 10 +- ...ay-write-even.cpp => array-write-even.cpp} | 0 ....cpp => array-write-ragged-append-dim.cpp} | 0 ...pp => array-write-ragged-internal-dim.cpp} | 0 tests/unit-tests/v2-array-write-even.cpp | 181 ------- .../v2-array-write-frame-to-chunks.cpp | 75 --- .../v2-array-write-ragged-append-dim.cpp | 150 ------ .../v2-array-write-ragged-internal-dim.cpp | 168 ------ 26 files changed, 14 insertions(+), 3010 deletions(-) rename tests/integration/{stream-zarr-v3-2d-multiscale-to-filesystem.cpp => stream-2d-multiscale-to-filesystem.cpp} (100%) rename tests/integration/{stream-zarr-v3-3d-multiscale-to-filesystem.cpp => stream-3d-multiscale-to-filesystem.cpp} (100%) rename tests/integration/{stream-zarr-v3-compressed-to-filesystem.cpp => stream-compressed-to-filesystem.cpp} (100%) rename 
tests/integration/{stream-zarr-v3-compressed-to-s3.cpp => stream-compressed-to-s3.cpp} (100%) rename tests/integration/{stream-zarr-v3-multi-frame-append.cpp => stream-multi-frame-append.cpp} (100%) rename tests/integration/{stream-zarr-v3-multiple-arrays-to-filesystem.cpp => stream-multiple-arrays-to-filesystem.cpp} (100%) rename tests/integration/{stream-zarr-v3-multiscale-trivial-3rd-dim.cpp => stream-multiscale-trivial-3rd-dim.cpp} (100%) rename tests/integration/{stream-zarr-v3-named-array-to-filesystem.cpp => stream-named-array-to-filesystem.cpp} (100%) rename tests/integration/{stream-zarr-v3-named-array-to-s3.cpp => stream-named-array-to-s3.cpp} (100%) rename tests/integration/{stream-zarr-v3-raw-to-filesystem.cpp => stream-raw-to-filesystem.cpp} (100%) rename tests/integration/{stream-zarr-v3-raw-to-s3.cpp => stream-raw-to-s3.cpp} (100%) delete mode 100644 tests/integration/stream-zarr-v2-compressed-to-filesystem.cpp delete mode 100644 tests/integration/stream-zarr-v2-compressed-to-s3.cpp delete mode 100644 tests/integration/stream-zarr-v2-named-array-to-filesystem.cpp delete mode 100644 tests/integration/stream-zarr-v2-named-array-to-s3.cpp delete mode 100644 tests/integration/stream-zarr-v2-raw-to-filesystem.cpp delete mode 100644 tests/integration/stream-zarr-v2-raw-to-s3.cpp rename tests/unit-tests/{v3-array-write-even.cpp => array-write-even.cpp} (100%) rename tests/unit-tests/{v3-array-write-ragged-append-dim.cpp => array-write-ragged-append-dim.cpp} (100%) rename tests/unit-tests/{v3-array-write-ragged-internal-dim.cpp => array-write-ragged-internal-dim.cpp} (100%) delete mode 100644 tests/unit-tests/v2-array-write-even.cpp delete mode 100644 tests/unit-tests/v2-array-write-frame-to-chunks.cpp delete mode 100644 tests/unit-tests/v2-array-write-ragged-append-dim.cpp delete mode 100644 tests/unit-tests/v2-array-write-ragged-internal-dim.cpp diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 16785e0c..5b50264b 100644 
--- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -1,23 +1,17 @@ set(project acquire-zarr) set(tests - stream-zarr-v2-raw-to-filesystem - stream-zarr-v2-named-array-to-filesystem - stream-zarr-v2-compressed-to-filesystem - stream-zarr-v2-raw-to-s3 - stream-zarr-v2-named-array-to-s3 - stream-zarr-v2-compressed-to-s3 - stream-zarr-v3-raw-to-filesystem - stream-zarr-v3-named-array-to-filesystem - stream-zarr-v3-compressed-to-filesystem - stream-zarr-v3-2d-multiscale-to-filesystem - stream-zarr-v3-3d-multiscale-to-filesystem - stream-zarr-v3-raw-to-s3 - stream-zarr-v3-named-array-to-s3 - stream-zarr-v3-compressed-to-s3 - stream-zarr-v3-multi-frame-append - stream-zarr-v3-multiscale-trivial-3rd-dim - stream-zarr-v3-multiple-arrays-to-filesystem + stream-raw-to-filesystem + stream-named-array-to-filesystem + stream-compressed-to-filesystem + stream-2d-multiscale-to-filesystem + stream-3d-multiscale-to-filesystem + stream-raw-to-s3 + stream-named-array-to-s3 + stream-compressed-to-s3 + stream-multi-frame-append + stream-multiscale-trivial-3rd-dim + stream-multiple-arrays-to-filesystem estimate-memory-usage stream-pure-hcs-acquisition stream-mixed-flat-and-hcs-acquisition diff --git a/tests/integration/stream-zarr-v3-2d-multiscale-to-filesystem.cpp b/tests/integration/stream-2d-multiscale-to-filesystem.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-2d-multiscale-to-filesystem.cpp rename to tests/integration/stream-2d-multiscale-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-3d-multiscale-to-filesystem.cpp b/tests/integration/stream-3d-multiscale-to-filesystem.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-3d-multiscale-to-filesystem.cpp rename to tests/integration/stream-3d-multiscale-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-compressed-to-filesystem.cpp b/tests/integration/stream-compressed-to-filesystem.cpp similarity index 100% rename from 
tests/integration/stream-zarr-v3-compressed-to-filesystem.cpp rename to tests/integration/stream-compressed-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-compressed-to-s3.cpp b/tests/integration/stream-compressed-to-s3.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-compressed-to-s3.cpp rename to tests/integration/stream-compressed-to-s3.cpp diff --git a/tests/integration/stream-zarr-v3-multi-frame-append.cpp b/tests/integration/stream-multi-frame-append.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-multi-frame-append.cpp rename to tests/integration/stream-multi-frame-append.cpp diff --git a/tests/integration/stream-zarr-v3-multiple-arrays-to-filesystem.cpp b/tests/integration/stream-multiple-arrays-to-filesystem.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-multiple-arrays-to-filesystem.cpp rename to tests/integration/stream-multiple-arrays-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-multiscale-trivial-3rd-dim.cpp b/tests/integration/stream-multiscale-trivial-3rd-dim.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-multiscale-trivial-3rd-dim.cpp rename to tests/integration/stream-multiscale-trivial-3rd-dim.cpp diff --git a/tests/integration/stream-zarr-v3-named-array-to-filesystem.cpp b/tests/integration/stream-named-array-to-filesystem.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-named-array-to-filesystem.cpp rename to tests/integration/stream-named-array-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-named-array-to-s3.cpp b/tests/integration/stream-named-array-to-s3.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-named-array-to-s3.cpp rename to tests/integration/stream-named-array-to-s3.cpp diff --git a/tests/integration/stream-zarr-v3-raw-to-filesystem.cpp b/tests/integration/stream-raw-to-filesystem.cpp similarity index 100% rename from 
tests/integration/stream-zarr-v3-raw-to-filesystem.cpp rename to tests/integration/stream-raw-to-filesystem.cpp diff --git a/tests/integration/stream-zarr-v3-raw-to-s3.cpp b/tests/integration/stream-raw-to-s3.cpp similarity index 100% rename from tests/integration/stream-zarr-v3-raw-to-s3.cpp rename to tests/integration/stream-raw-to-s3.cpp diff --git a/tests/integration/stream-zarr-v2-compressed-to-filesystem.cpp b/tests/integration/stream-zarr-v2-compressed-to-filesystem.cpp deleted file mode 100644 index 47494dbf..00000000 --- a/tests/integration/stream-zarr-v2-compressed-to-filesystem.cpp +++ /dev/null @@ -1,374 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include - -#include -#include -#include - -namespace fs = std::filesystem; - -namespace { -const std::string test_path = - (fs::temp_directory_path() / (TEST ".zarr")).string(); - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; -} // namespace/s - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .data_type = ZarrDataType_int32, - }; - - ZarrStreamSettings settings = { - .store_path = test_path.c_str(), - 
.s3_settings = nullptr, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1 - }; - - ZarrCompressionSettings compression_settings = { - .compressor = ZarrCompressor_Blosc1, - .codec = ZarrCompressionCodec_BloscZstd, - .level = 1, - .shuffle = 1, - }; - settings.arrays->compression_settings = &compression_settings; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim = settings.arrays->dimensions + 3; - *dim = DIM("y", - ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_base_metadata(const nlohmann::json& meta) -{ - const auto multiscales = meta["multiscales"][0]; - const auto ngff_version = multiscales["version"].get(); - EXPECT(ngff_version == "0.4", - "Expected version to be '0.4', but got '", - ngff_version, - "'"); - - const auto axes = multiscales["axes"]; - EXPECT_EQ(size_t, axes.size(), 5); - std::string name, type, unit; - - name = axes[0]["name"]; - type = axes[0]["type"]; - EXPECT(name == "t", "Expected name to be 't', but got '", name, "'"); - EXPECT(type == "time", "Expected type to be 'time', but got '", type, "'"); - EXPECT(!axes[0].contains("unit"), - "Expected 
unit to be missing, got ", - axes[0]["unit"].get()); - - name = axes[1]["name"]; - type = axes[1]["type"]; - EXPECT(name == "c", "Expected name to be 'c', but got '", name, "'"); - EXPECT( - type == "channel", "Expected type to be 'channel', but got '", type, "'"); - EXPECT(!axes[1].contains("unit"), - "Expected unit to be missing, got ", - axes[1]["unit"].get()); - - name = axes[2]["name"]; - type = axes[2]["type"]; - unit = axes[2]["unit"]; - EXPECT(name == "z", "Expected name to be 'z', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "millimeter", - "Expected unit to be 'millimeter', but got '", - unit, - "'"); - - name = axes[3]["name"]; - type = axes[3]["type"]; - unit = axes[3]["unit"]; - EXPECT(name == "y", "Expected name to be 'y', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - name = axes[4]["name"]; - type = axes[4]["type"]; - unit = axes[4]["unit"]; - EXPECT(name == "x", "Expected name to be 'x', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - const auto datasets = multiscales["datasets"][0]; - const std::string path = datasets["path"].get(); - EXPECT(path == "0", "Expected path to be '0', but got '", path, "'"); - - const auto coordinate_transformations = - datasets["coordinateTransformations"][0]; - - type = coordinate_transformations["type"].get(); - EXPECT( - type == "scale", "Expected type to be 'scale', but got '", type, "'"); - - const auto scale = coordinate_transformations["scale"]; - EXPECT_EQ(size_t, scale.size(), 5); - EXPECT_EQ(double, scale[0].get(), 1.0); - EXPECT_EQ(double, scale[1].get(), 1.0); - EXPECT_EQ(double, scale[2].get(), 1.4); - 
EXPECT_EQ(double, scale[3].get(), 0.9); - EXPECT_EQ(double, scale[4].get(), 0.9); -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width); - - const auto dtype = meta["dtype"].get(); - EXPECT(dtype == "(); - EXPECT(compressor_id == "blosc", - "Expected compressor id to be blosc, but got ", - compressor_id); - - const auto cname = compressor["cname"].get(); - EXPECT( - cname == "zstd", "Expected compressor cname to be zstd, but got ", cname); - - const auto clevel = compressor["clevel"].get(); - EXPECT_EQ(int, clevel, 1); - - const auto shuffle = compressor["shuffle"].get(); - EXPECT_EQ(int, shuffle, 1); -} - -void -verify_file_data() -{ - const auto expected_file_size = chunk_width * chunk_height * chunk_planes * - chunk_channels * chunk_timepoints * - nbytes_px; - - fs::path data_root = fs::path(test_path) / "0"; - - CHECK(fs::is_directory(data_root)); - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root / std::to_string(t); - CHECK(fs::is_directory(t_dir)); - - for (auto c = 0; c < chunks_in_c; ++c) { - const auto c_dir = t_dir / std::to_string(c); - CHECK(fs::is_directory(c_dir)); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = 
c_dir / std::to_string(z); - CHECK(fs::is_directory(z_dir)); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir / std::to_string(y); - CHECK(fs::is_directory(y_dir)); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir / std::to_string(x); - CHECK(fs::is_regular_file(x_file)); - const auto file_size = fs::file_size(x_file); - EXPECT(file_size < expected_file_size, - "Expected file size < ", - expected_file_size, - " for file ", - x_file.string(), - ", got ", - file_size); - } - - CHECK(!fs::is_regular_file(y_dir / - std::to_string(chunks_in_x))); - } - - CHECK(!fs::is_directory(z_dir / std::to_string(chunks_in_y))); - } - - CHECK(!fs::is_directory(c_dir / std::to_string(chunks_in_z))); - } - - CHECK(!fs::is_directory(t_dir / std::to_string(chunks_in_c))); - } - - CHECK(!fs::is_directory(data_root / std::to_string(chunks_in_t))); -} - -void -verify() -{ - CHECK(std::filesystem::is_directory(test_path)); - - { - fs::path base_metadata_path = fs::path(test_path) / ".zattrs"; - std::ifstream f(base_metadata_path); - nlohmann::json base_metadata = nlohmann::json::parse(f); - - verify_base_metadata(base_metadata); - } - - { - fs::path group_metadata_path = fs::path(test_path) / ".zgroup"; - std::ifstream f = std::ifstream(group_metadata_path); - nlohmann::json group_metadata = nlohmann::json::parse(f); - - verify_group_metadata(group_metadata); - } - - { - fs::path array_metadata_path = fs::path(test_path) / "0" / ".zarray"; - std::ifstream f = std::ifstream(array_metadata_path); - nlohmann::json array_metadata = nlohmann::json::parse(f); - - verify_array_metadata(array_metadata); - } - - verify_file_data(); -} - -int -main() -{ - Zarr_set_log_level(ZarrLogLevel_Debug); - - auto* stream = setup(); - std::vector frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, 
&bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", - i, - ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify(); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: ", e.what()); - } - - // cleanup - if (fs::exists(test_path)) { - fs::remove_all(test_path); - } - - return retval; -} diff --git a/tests/integration/stream-zarr-v2-compressed-to-s3.cpp b/tests/integration/stream-zarr-v2-compressed-to-s3.cpp deleted file mode 100644 index 2af46c2b..00000000 --- a/tests/integration/stream-zarr-v2-compressed-to-s3.cpp +++ /dev/null @@ -1,507 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include -#include - -#include - -namespace { -std::string s3_endpoint, s3_bucket_name, s3_access_key_id, s3_secret_access_key, - s3_region; - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; - -bool -get_credentials() -{ - char* env = nullptr; - if (!(env = std::getenv("ZARR_S3_ENDPOINT"))) { - LOG_ERROR("ZARR_S3_ENDPOINT not set."); - 
return false; - } - s3_endpoint = env; - - if (!(env = std::getenv("ZARR_S3_BUCKET_NAME"))) { - LOG_ERROR("ZARR_S3_BUCKET_NAME not set."); - return false; - } - s3_bucket_name = env; - - if (!(env = std::getenv("AWS_ACCESS_KEY_ID"))) { - LOG_ERROR("AWS_ACCESS_KEY_ID not set."); - return false; - } - s3_access_key_id = env; - - if (!(env = std::getenv("AWS_SECRET_ACCESS_KEY"))) { - LOG_ERROR("AWS_SECRET_ACCESS_KEY not set."); - return false; - } - s3_secret_access_key = env; - - env = std::getenv("ZARR_S3_REGION"); - if (env) { - s3_region = env; - } - - return true; -} - -bool -object_exists(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - return (bool)response; -} - -size_t -get_object_size(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - if (!response) { - LOG_ERROR("Failed to get object size: %s", object_name.c_str()); - return 0; - } - - return response.size; -} - -std::string -get_object_contents(minio::s3::Client& client, const std::string& object_name) -{ - std::stringstream ss; - - minio::s3::GetObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - args.datafunc = [&ss](minio::http::DataFunctionArgs args) -> bool { - ss << args.datachunk; - return true; - }; - - // Call get object. 
- minio::s3::GetObjectResponse resp = client.GetObject(args); - - return ss.str(); -} - -bool -remove_items(minio::s3::Client& client, - const std::vector& item_keys) -{ - std::list objects; - for (const auto& key : item_keys) { - minio::s3::DeleteObject object; - object.name = key; - objects.push_back(object); - } - - minio::s3::RemoveObjectsArgs args; - args.bucket = s3_bucket_name; - - auto it = objects.begin(); - - args.func = [&objects = objects, - &i = it](minio::s3::DeleteObject& obj) -> bool { - if (i == objects.end()) - return false; - obj = *i; - i++; - return true; - }; - - minio::s3::RemoveObjectsResult result = client.RemoveObjects(args); - for (; result; result++) { - minio::s3::DeleteError err = *result; - if (!err) { - LOG_ERROR("Failed to delete object %s: %s", - err.object_name.c_str(), - err.message.c_str()); - return false; - } - } - - return true; -} -} // namespace/s - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .data_type = ZarrDataType_int32, - }; - ZarrStreamSettings settings = { - .store_path = TEST, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1, - }; - - ZarrS3Settings s3_settings{ - .endpoint = s3_endpoint.c_str(), - .bucket_name = s3_bucket_name.c_str(), - }; - if (!s3_region.empty()) { - s3_settings.region = s3_region.c_str(); - } - - settings.s3_settings = &s3_settings; - - ZarrCompressionSettings compression_settings = { - .compressor = ZarrCompressor_Blosc1, - .codec = ZarrCompressionCodec_BloscZstd, - .level = 1, - .shuffle = 1, - }; - settings.arrays->compression_settings = &compression_settings; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - 
array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim = settings.arrays->dimensions + 3; - *dim = DIM("y", - ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_base_metadata(const nlohmann::json& meta) -{ - const auto multiscales = meta["multiscales"][0]; - const auto ngff_version = multiscales["version"].get(); - EXPECT(ngff_version == "0.4", - "Expected version to be '0.4', but got '", - ngff_version, - "'"); - - const auto axes = multiscales["axes"]; - EXPECT_EQ(size_t, axes.size(), 5); - std::string name, type, unit; - - name = axes[0]["name"]; - type = axes[0]["type"]; - EXPECT(name == "t", "Expected name to be 't', but got '", name, "'"); - EXPECT(type == "time", "Expected type to be 'time', but got '", type, "'"); - EXPECT(!axes[0].contains("unit"), - "Expected unit to be missing, got ", - axes[0]["unit"].get()); - - name = axes[1]["name"]; - type = axes[1]["type"]; - EXPECT(name == "c", "Expected name to be 'c', but got '", name, "'"); - EXPECT( - type == "channel", "Expected type to be 'channel', but got '", type, "'"); - EXPECT(!axes[1].contains("unit"), - "Expected unit to be missing, got ", - axes[1]["unit"].get()); - - name = axes[2]["name"]; - type = axes[2]["type"]; - unit = axes[2]["unit"]; - EXPECT(name == "z", "Expected name to be 'z', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "millimeter", - "Expected unit to be 'millimeter', but got '", - unit, - "'"); - - name = axes[3]["name"]; - 
type = axes[3]["type"]; - unit = axes[3]["unit"]; - EXPECT(name == "y", "Expected name to be 'y', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - name = axes[4]["name"]; - type = axes[4]["type"]; - unit = axes[4]["unit"]; - EXPECT(name == "x", "Expected name to be 'x', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - const auto datasets = multiscales["datasets"][0]; - const std::string path = datasets["path"].get(); - EXPECT(path == "0", "Expected path to be '0', but got '", path, "'"); - - const auto coordinate_transformations = - datasets["coordinateTransformations"][0]; - - type = coordinate_transformations["type"].get(); - EXPECT( - type == "scale", "Expected type to be 'scale', but got '", type, "'"); - - const auto scale = coordinate_transformations["scale"]; - EXPECT_EQ(size_t, scale.size(), 5); - EXPECT_EQ(double, scale[0].get(), 1.0); - EXPECT_EQ(double, scale[1].get(), 1.0); - EXPECT_EQ(double, scale[2].get(), 1.4); - EXPECT_EQ(double, scale[3].get(), 0.9); - EXPECT_EQ(double, scale[4].get(), 0.9); -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), 
chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width); - - const auto dtype = meta["dtype"].get(); - EXPECT(dtype == "(); - EXPECT(compressor_id == "blosc", - "Expected compressor id to be 'blosc', but got '%s'", - compressor_id.c_str()); - - const auto cname = compressor["cname"].get(); - EXPECT(cname == "zstd", - "Expected compressor cname to be 'zstd', but got '%s'", - cname.c_str()); - - const auto clevel = compressor["clevel"].get(); - EXPECT_EQ(int, clevel, 1); - - const auto shuffle = compressor["shuffle"].get(); - EXPECT_EQ(int, shuffle, 1); -} - -void -verify_and_cleanup() -{ - - minio::s3::BaseUrl url(s3_endpoint); - url.https = s3_endpoint.starts_with("https://"); - - minio::creds::StaticProvider provider(s3_access_key_id, - s3_secret_access_key); - minio::s3::Client client(url, &provider); - - std::string base_metadata_path = TEST "/.zattrs"; - std::string group_metadata_path = TEST "/.zgroup"; - std::string array_metadata_path = TEST "/0/.zarray"; - - { - EXPECT(object_exists(client, base_metadata_path), - "Object does not exist: %s", - base_metadata_path.c_str()); - std::string contents = get_object_contents(client, base_metadata_path); - nlohmann::json base_metadata = nlohmann::json::parse(contents); - - verify_base_metadata(base_metadata); - } - - { - EXPECT(object_exists(client, group_metadata_path), - "Object does not exist: %s", - group_metadata_path.c_str()); - std::string contents = get_object_contents(client, group_metadata_path); - nlohmann::json group_metadata = nlohmann::json::parse(contents); - - verify_group_metadata(group_metadata); - } - - { - EXPECT(object_exists(client, array_metadata_path), - "Object does not exist: %s", - array_metadata_path.c_str()); - std::string contents = get_object_contents(client, array_metadata_path); - nlohmann::json array_metadata = 
nlohmann::json::parse(contents); - - verify_array_metadata(array_metadata); - } - - CHECK(remove_items( - client, - { base_metadata_path, group_metadata_path, array_metadata_path })); - - const auto expected_file_size = chunk_width * chunk_height * chunk_planes * - chunk_channels * chunk_timepoints * - nbytes_px; - - // verify and clean up data files - std::vector data_files; - std::string data_root = TEST "/0"; - - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root + "/" + std::to_string(t); - - for (auto c = 0; c < chunks_in_c; ++c) { - const auto c_dir = t_dir + "/" + std::to_string(c); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = c_dir + "/" + std::to_string(z); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir + "/" + std::to_string(y); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir + "/" + std::to_string(x); - EXPECT(object_exists(client, x_file), - "Object does not exist: %s", - x_file.c_str()); - const auto file_size = get_object_size(client, x_file); - EXPECT_LT(size_t, file_size, expected_file_size); - data_files.push_back(x_file); - } - - CHECK(!object_exists( - client, y_dir + "/" + std::to_string(chunks_in_x))); - } - } - } - } - - CHECK(remove_items(client, data_files)); -} - -int -main() -{ - if (!get_credentials()) { - LOG_WARNING("Failed to get credentials. 
Skipping test."); - return 0; - } - - Zarr_set_log_level(ZarrLogLevel_Debug); - - auto* stream = setup(); - std::vector frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", - i, - ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify_and_cleanup(); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: ", e.what()); - } - - return retval; -} diff --git a/tests/integration/stream-zarr-v2-named-array-to-filesystem.cpp b/tests/integration/stream-zarr-v2-named-array-to-filesystem.cpp deleted file mode 100644 index 28df940b..00000000 --- a/tests/integration/stream-zarr-v2-named-array-to-filesystem.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include - -#include -#include -#include - -namespace fs = std::filesystem; - -namespace { -const std::string test_path = - (fs::temp_directory_path() / (TEST ".zarr")).string(); - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const 
size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; -} // namespace/s - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .output_key = "intermediate/path", - .compression_settings = nullptr, - .data_type = ZarrDataType_int32, - }; - ZarrStreamSettings settings = { - .store_path = test_path.c_str(), - .s3_settings = nullptr, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1, - }; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim = settings.arrays->dimensions + 3; - *dim = DIM("y", - ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, 
shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width); - - const auto dtype = meta["dtype"].get(); - EXPECT(dtype == " frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", - i, - ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify(); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: %s", e.what()); - } - - // Clean up - if (fs::exists(test_path)) { - fs::remove_all(test_path); - } - - return retval; -} diff --git a/tests/integration/stream-zarr-v2-named-array-to-s3.cpp b/tests/integration/stream-zarr-v2-named-array-to-s3.cpp deleted file mode 100644 index 1bd16c64..00000000 --- a/tests/integration/stream-zarr-v2-named-array-to-s3.cpp +++ /dev/null @@ -1,404 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include -#include - -#include - -namespace { -std::string s3_endpoint, s3_bucket_name, s3_access_key_id, s3_secret_access_key, - s3_region; - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks 
-const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; - -bool -get_credentials() -{ - char* env = nullptr; - if (!(env = std::getenv("ZARR_S3_ENDPOINT"))) { - LOG_ERROR("ZARR_S3_ENDPOINT not set."); - return false; - } - s3_endpoint = env; - - if (!(env = std::getenv("ZARR_S3_BUCKET_NAME"))) { - LOG_ERROR("ZARR_S3_BUCKET_NAME not set."); - return false; - } - s3_bucket_name = env; - - if (!(env = std::getenv("AWS_ACCESS_KEY_ID"))) { - LOG_ERROR("AWS_ACCESS_KEY_ID not set."); - return false; - } - s3_access_key_id = env; - - if (!(env = std::getenv("AWS_SECRET_ACCESS_KEY"))) { - LOG_ERROR("AWS_SECRET_ACCESS_KEY not set."); - return false; - } - s3_secret_access_key = env; - - env = std::getenv("ZARR_S3_REGION"); - if (env) { - s3_region = env; - } - - return true; -} - -bool -object_exists(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - return (bool)response; -} - -size_t -get_object_size(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - if (!response) { - LOG_ERROR("Failed to get object size: %s", object_name.c_str()); - return 0; - } - - return response.size; -} - 
-std::string -get_object_contents(minio::s3::Client& client, const std::string& object_name) -{ - std::stringstream ss; - - minio::s3::GetObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - args.datafunc = [&ss](minio::http::DataFunctionArgs args) -> bool { - ss << args.datachunk; - return true; - }; - - // Call get object. - minio::s3::GetObjectResponse resp = client.GetObject(args); - - return ss.str(); -} - -bool -remove_items(minio::s3::Client& client, - const std::vector& item_keys) -{ - std::list objects; - for (const auto& key : item_keys) { - minio::s3::DeleteObject object; - object.name = key; - objects.push_back(object); - } - - minio::s3::RemoveObjectsArgs args; - args.bucket = s3_bucket_name; - - auto it = objects.begin(); - - args.func = [&objects = objects, - &i = it](minio::s3::DeleteObject& obj) -> bool { - if (i == objects.end()) - return false; - obj = *i; - i++; - return true; - }; - - minio::s3::RemoveObjectsResult result = client.RemoveObjects(args); - for (; result; result++) { - minio::s3::DeleteError err = *result; - if (!err) { - LOG_ERROR("Failed to delete object %s: %s", - err.object_name.c_str(), - err.message.c_str()); - return false; - } - } - - return true; -} -} // namespace - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .output_key = "intermediate/path", - .compression_settings = nullptr, - .data_type = ZarrDataType_int32, - }; - ZarrStreamSettings settings = { - .store_path = TEST, - .s3_settings = nullptr, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1, - }; - - ZarrS3Settings s3_settings{ - .endpoint = s3_endpoint.c_str(), - .bucket_name = s3_bucket_name.c_str(), - }; - if (!s3_region.empty()) { - s3_settings.region = s3_region.c_str(); - } - - settings.s3_settings = &s3_settings; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = 
settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim = settings.arrays->dimensions + 3; - *dim = DIM("y", - ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width); - - const auto dtype = meta["dtype"].get(); - EXPECT(dtype == " data_files; - std::string data_root = TEST "/intermediate/path"; - - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root + "/" + std::to_string(t); - - for (auto c = 0; c < chunks_in_c; 
++c) { - const auto c_dir = t_dir + "/" + std::to_string(c); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = c_dir + "/" + std::to_string(z); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir + "/" + std::to_string(y); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir + "/" + std::to_string(x); - EXPECT(object_exists(client, x_file), - "Object does not exist: ", - x_file); - const auto file_size = get_object_size(client, x_file); - EXPECT_EQ(size_t, file_size, expected_file_size); - data_files.push_back(x_file); - } - - CHECK(!object_exists( - client, y_dir + "/" + std::to_string(chunks_in_x))); - } - } - } - } - - CHECK(remove_items(client, data_files)); -} - -int -main() -{ - if (!get_credentials()) { - LOG_WARNING("Failed to get credentials. Skipping test."); - return 0; - } - - Zarr_set_log_level(ZarrLogLevel_Debug); - - auto* stream = setup(); - std::vector frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", - i, - ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify_and_cleanup(); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: %s", e.what()); - } - - return retval; -} diff --git a/tests/integration/stream-zarr-v2-raw-to-filesystem.cpp b/tests/integration/stream-zarr-v2-raw-to-filesystem.cpp deleted file mode 100644 index 3e154fe2..00000000 --- a/tests/integration/stream-zarr-v2-raw-to-filesystem.cpp +++ /dev/null @@ -1,355 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include - -#include -#include -#include - -namespace fs = std::filesystem; - -namespace { -const std::string test_path = - 
(fs::temp_directory_path() / (TEST ".zarr")).string(); - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; -} // namespace - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .compression_settings = nullptr, - .data_type = ZarrDataType_int32, - }; - ZarrStreamSettings settings = { - .store_path = test_path.c_str(), - .s3_settings = nullptr, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1 - }; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim = settings.arrays->dimensions + 3; - *dim = DIM("y", - 
ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_base_metadata(const nlohmann::json& meta) -{ - const auto multiscales = meta["multiscales"][0]; - const auto ngff_version = multiscales["version"].get(); - EXPECT(ngff_version == "0.4", - "Expected version to be '0.4', but got '", - ngff_version, - "'"); - - const auto axes = multiscales["axes"]; - EXPECT_EQ(size_t, axes.size(), 5); - std::string name, type, unit; - - name = axes[0]["name"]; - type = axes[0]["type"]; - EXPECT(name == "t", "Expected name to be 't', but got '", name, "'"); - EXPECT(type == "time", "Expected type to be 'time', but got '", type, "'"); - EXPECT(!axes[0].contains("unit"), - "Expected unit to be missing, got ", - axes[0]["unit"].get()); - - name = axes[1]["name"]; - type = axes[1]["type"]; - EXPECT(name == "c", "Expected name to be 'c', but got '", name, "'"); - EXPECT( - type == "channel", "Expected type to be 'channel', but got '", type, "'"); - EXPECT(!axes[1].contains("unit"), - "Expected unit to be missing, got ", - axes[1]["unit"].get()); - - name = axes[2]["name"]; - type = axes[2]["type"]; - unit = axes[2]["unit"]; - EXPECT(name == "z", "Expected name to be 'z', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "millimeter", - "Expected unit to be 'millimeter', but got '", - unit, - "'"); - - name = axes[3]["name"]; - type = axes[3]["type"]; - unit = axes[3]["unit"]; - EXPECT(name == "y", "Expected name to be 'y', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 
'micrometer', but got '", - unit, - "'"); - - name = axes[4]["name"]; - type = axes[4]["type"]; - unit = axes[4]["unit"]; - EXPECT(name == "x", "Expected name to be 'x', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - const auto datasets = multiscales["datasets"][0]; - const std::string path = datasets["path"].get(); - EXPECT(path == "0", "Expected path to be '0', but got '", path, "'"); - - const auto coordinate_transformations = - datasets["coordinateTransformations"][0]; - - type = coordinate_transformations["type"].get(); - EXPECT(type == "scale", "Expected type to be 'scale', but got '", type, "'"); - - const auto scale = coordinate_transformations["scale"]; - EXPECT_EQ(size_t, scale.size(), 5); - EXPECT_EQ(double, scale[0].get(), 1.0); - EXPECT_EQ(double, scale[1].get(), 1.0); - EXPECT_EQ(double, scale[2].get(), 1.4); - EXPECT_EQ(double, scale[3].get(), 0.9); - EXPECT_EQ(double, scale[4].get(), 0.9); -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width); - - const auto dtype = meta["dtype"].get(); - 
EXPECT(dtype == " frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", i, ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify(); - - // Clean up - fs::remove_all(test_path); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: ", e.what()); - } - - // cleanup - if (fs::exists(test_path)) { - fs::remove_all(test_path); - } - - return retval; -} diff --git a/tests/integration/stream-zarr-v2-raw-to-s3.cpp b/tests/integration/stream-zarr-v2-raw-to-s3.cpp deleted file mode 100644 index 533e636f..00000000 --- a/tests/integration/stream-zarr-v2-raw-to-s3.cpp +++ /dev/null @@ -1,483 +0,0 @@ -#include "acquire.zarr.h" -#include "test.macros.hh" - -#include -#include - -#include - -namespace { -std::string s3_endpoint, s3_bucket_name, s3_access_key_id, s3_secret_access_key, - s3_region; - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; - -const size_t nbytes_px = sizeof(int32_t); -const uint32_t frames_to_acquire = - 
array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; - -bool -get_credentials() -{ - char* env = nullptr; - if (!(env = std::getenv("ZARR_S3_ENDPOINT"))) { - LOG_ERROR("ZARR_S3_ENDPOINT not set."); - return false; - } - s3_endpoint = env; - - if (!(env = std::getenv("ZARR_S3_BUCKET_NAME"))) { - LOG_ERROR("ZARR_S3_BUCKET_NAME not set."); - return false; - } - s3_bucket_name = env; - - if (!(env = std::getenv("AWS_ACCESS_KEY_ID"))) { - LOG_ERROR("AWS_ACCESS_KEY_ID not set."); - return false; - } - s3_access_key_id = env; - - if (!(env = std::getenv("AWS_SECRET_ACCESS_KEY"))) { - LOG_ERROR("AWS_SECRET_ACCESS_KEY not set."); - return false; - } - s3_secret_access_key = env; - - env = std::getenv("ZARR_S3_REGION"); - if (env) { - s3_region = env; - } - - return true; -} - -bool -object_exists(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - return (bool)response; -} - -size_t -get_object_size(minio::s3::Client& client, const std::string& object_name) -{ - minio::s3::StatObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - - minio::s3::StatObjectResponse response = client.StatObject(args); - - if (!response) { - LOG_ERROR("Failed to get object size: %s", object_name.c_str()); - return 0; - } - - return response.size; -} - -std::string -get_object_contents(minio::s3::Client& client, const std::string& object_name) -{ - std::stringstream ss; - - minio::s3::GetObjectArgs args; - args.bucket = s3_bucket_name; - args.object = object_name; - args.datafunc = [&ss](minio::http::DataFunctionArgs args) -> bool { - ss << args.datachunk; - return true; - }; - - // Call get object. 
- minio::s3::GetObjectResponse resp = client.GetObject(args); - - return ss.str(); -} - -bool -remove_items(minio::s3::Client& client, - const std::vector& item_keys) -{ - std::list objects; - for (const auto& key : item_keys) { - minio::s3::DeleteObject object; - object.name = key; - objects.push_back(object); - } - - minio::s3::RemoveObjectsArgs args; - args.bucket = s3_bucket_name; - - auto it = objects.begin(); - - args.func = [&objects = objects, - &i = it](minio::s3::DeleteObject& obj) -> bool { - if (i == objects.end()) - return false; - obj = *i; - i++; - return true; - }; - - minio::s3::RemoveObjectsResult result = client.RemoveObjects(args); - for (; result; result++) { - minio::s3::DeleteError err = *result; - if (!err) { - LOG_ERROR("Failed to delete object %s: %s", - err.object_name.c_str(), - err.message.c_str()); - return false; - } - } - - return true; -} -} // namespace - -ZarrStream* -setup() -{ - ZarrArraySettings array = { - .compression_settings = nullptr, - .data_type = ZarrDataType_int32, - }; - ZarrStreamSettings settings = { - .store_path = TEST, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - .arrays = &array, - .array_count = 1, - }; - - ZarrS3Settings s3_settings{ - .endpoint = s3_endpoint.c_str(), - .bucket_name = s3_bucket_name.c_str(), - }; - if (!s3_region.empty()) { - s3_settings.region = s3_region.c_str(); - } - - settings.s3_settings = &s3_settings; - - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - 0, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; - *dim = DIM("z", - ZarrDimensionType_Space, - array_planes, - chunk_planes, - 0, - "millimeter", - 1.4); - - dim 
= settings.arrays->dimensions + 3; - *dim = DIM("y", - ZarrDimensionType_Space, - array_height, - chunk_height, - 0, - "micrometer", - 0.9); - - dim = settings.arrays->dimensions + 4; - *dim = DIM("x", - ZarrDimensionType_Space, - array_width, - chunk_width, - 0, - "micrometer", - 0.9); - - auto* stream = ZarrStream_create(&settings); - ZarrArraySettings_destroy_dimension_array(settings.arrays); - - return stream; -} - -void -verify_base_metadata(const nlohmann::json& meta) -{ - const auto multiscales = meta["multiscales"][0]; - const auto ngff_version = multiscales["version"].get(); - EXPECT(ngff_version == "0.4", - "Expected version to be '0.4', but got '", - ngff_version, - "'"); - - const auto axes = multiscales["axes"]; - EXPECT_EQ(size_t, axes.size(), 5); - std::string name, type, unit; - - name = axes[0]["name"]; - type = axes[0]["type"]; - EXPECT(name == "t", "Expected name to be 't', but got '", name, "'"); - EXPECT(type == "time", "Expected type to be 'time', but got '", type, "'"); - EXPECT(!axes[0].contains("unit"), - "Expected unit to be missing, got ", - axes[0]["unit"].get()); - - name = axes[1]["name"]; - type = axes[1]["type"]; - EXPECT(name == "c", "Expected name to be 'c', but got '", name, "'"); - EXPECT( - type == "channel", "Expected type to be 'channel', but got '", type, "'"); - EXPECT(!axes[1].contains("unit"), - "Expected unit to be missing, got ", - axes[1]["unit"].get()); - - name = axes[2]["name"]; - type = axes[2]["type"]; - unit = axes[2]["unit"]; - EXPECT(name == "z", "Expected name to be 'z', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "millimeter", - "Expected unit to be 'millimeter', but got '", - unit, - "'"); - - name = axes[3]["name"]; - type = axes[3]["type"]; - unit = axes[3]["unit"]; - EXPECT(name == "y", "Expected name to be 'y', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - 
EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - name = axes[4]["name"]; - type = axes[4]["type"]; - unit = axes[4]["unit"]; - EXPECT(name == "x", "Expected name to be 'x', but got '", name, "'"); - EXPECT( - type == "space", "Expected type to be 'space', but got '", type, "'"); - EXPECT(unit == "micrometer", - "Expected unit to be 'micrometer', but got '", - unit, - "'"); - - const auto datasets = multiscales["datasets"][0]; - const std::string path = datasets["path"].get(); - EXPECT(path == "0", "Expected path to be '0', but got '", path, "'"); - - const auto coordinate_transformations = - datasets["coordinateTransformations"][0]; - - type = coordinate_transformations["type"].get(); - EXPECT( - type == "scale", "Expected type to be 'scale', but got '", type, "'"); - - const auto scale = coordinate_transformations["scale"]; - EXPECT_EQ(size_t, scale.size(), 5); - EXPECT_EQ(double, scale[0].get(), 1.0); - EXPECT_EQ(double, scale[1].get(), 1.0); - EXPECT_EQ(double, scale[2].get(), 1.4); - EXPECT_EQ(double, scale[3].get(), 0.9); - EXPECT_EQ(double, scale[4].get(), 0.9); -} - -void -verify_group_metadata(const nlohmann::json& meta) -{ - const auto zarr_format = meta["zarr_format"].get(); - EXPECT_EQ(int, zarr_format, 2); -} - -void -verify_array_metadata(const nlohmann::json& meta) -{ - const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); - - const auto& chunks = meta["chunks"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height); - EXPECT_EQ(int, chunks[4].get(), 
chunk_width); - - const auto dtype = meta["dtype"].get(); - EXPECT(dtype == " data_files; - std::string data_root = TEST "/0"; - - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root + "/" + std::to_string(t); - - for (auto c = 0; c < chunks_in_c; ++c) { - const auto c_dir = t_dir + "/" + std::to_string(c); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = c_dir + "/" + std::to_string(z); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir + "/" + std::to_string(y); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir + "/" + std::to_string(x); - EXPECT(object_exists(client, x_file), - "Object does not exist: %s", - x_file.c_str()); - const auto file_size = get_object_size(client, x_file); - EXPECT_EQ(size_t, file_size, expected_file_size); - data_files.push_back(x_file); - } - - CHECK(!object_exists( - client, y_dir + "/" + std::to_string(chunks_in_x))); - } - } - } - } - - CHECK(remove_items(client, data_files)); -} - -int -main() -{ - if (!get_credentials()) { - LOG_WARNING("Failed to get credentials. 
Skipping test."); - return 0; - } - - Zarr_set_log_level(ZarrLogLevel_Debug); - - auto* stream = setup(); - std::vector frame(array_width * array_height, 0); - - int retval = 1; - - try { - size_t bytes_out; - for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( - stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); - EXPECT(status == ZarrStatusCode_Success, - "Failed to append frame ", i, ": ", - Zarr_get_status_message(status)); - EXPECT_EQ(size_t, bytes_out, bytes_of_frame); - } - - ZarrStream_destroy(stream); - - verify_and_cleanup(); - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: ", e.what()); - } - - return retval; -} diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt index 927a3254..9f84209c 100644 --- a/tests/unit-tests/CMakeLists.txt +++ b/tests/unit-tests/CMakeLists.txt @@ -19,13 +19,9 @@ set(tests s3-sink-write s3-sink-write-multipart make-data-sinks - v2-array-write-frame-to-chunks - v2-array-write-even - v2-array-write-ragged-append-dim - v2-array-write-ragged-internal-dim - v3-array-write-even - v3-array-write-ragged-append-dim - v3-array-write-ragged-internal-dim + array-write-even + array-write-ragged-append-dim + array-write-ragged-internal-dim zarr-stream-partial-append frame-queue downsampler diff --git a/tests/unit-tests/v3-array-write-even.cpp b/tests/unit-tests/array-write-even.cpp similarity index 100% rename from tests/unit-tests/v3-array-write-even.cpp rename to tests/unit-tests/array-write-even.cpp diff --git a/tests/unit-tests/v3-array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp similarity index 100% rename from tests/unit-tests/v3-array-write-ragged-append-dim.cpp rename to tests/unit-tests/array-write-ragged-append-dim.cpp diff --git a/tests/unit-tests/v3-array-write-ragged-internal-dim.cpp b/tests/unit-tests/array-write-ragged-internal-dim.cpp similarity index 100% rename from 
tests/unit-tests/v3-array-write-ragged-internal-dim.cpp rename to tests/unit-tests/array-write-ragged-internal-dim.cpp diff --git a/tests/unit-tests/v2-array-write-even.cpp b/tests/unit-tests/v2-array-write-even.cpp deleted file mode 100644 index 131f78e8..00000000 --- a/tests/unit-tests/v2-array-write-even.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "v2.array.hh" -#include "unit.test.macros.hh" -#include "zarr.common.hh" - -#include - -#include - -namespace fs = std::filesystem; - -namespace { -const fs::path base_dir = fs::temp_directory_path() / TEST; - -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; -const unsigned int n_frames = array_planes * array_channels * array_timepoints; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks - -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; // 2 chunks - -const int level_of_detail = 0; -} // namespace - -void -check_json() -{ - fs::path meta_path = base_dir / ".zarray"; - CHECK(fs::is_regular_file(meta_path)); - - std::ifstream f(meta_path); - nlohmann::json meta = nlohmann::json::parse(f); - - EXPECT(meta["dtype"].get() == "()); - - EXPECT_EQ(int, meta["zarr_format"].get(), 2); - - const auto& array_shape = meta["shape"]; - const auto& chunk_shape = meta["chunks"]; - - EXPECT_EQ(int, array_shape.size(), 5); - EXPECT_EQ(int, array_shape[0].get(), array_timepoints); - EXPECT_EQ(int, array_shape[1].get(), array_channels); - EXPECT_EQ(int, 
array_shape[2].get(), array_planes); - EXPECT_EQ(int, array_shape[3].get(), array_height); - EXPECT_EQ(int, array_shape[4].get(), array_width); - - EXPECT_EQ(int, chunk_shape.size(), 5); - EXPECT_EQ(int, chunk_shape[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunk_shape[1].get(), chunk_channels); - EXPECT_EQ(int, chunk_shape[2].get(), chunk_planes); - EXPECT_EQ(int, chunk_shape[3].get(), chunk_height); - EXPECT_EQ(int, chunk_shape[4].get(), chunk_width); -} - -int -main() -{ - Logger::set_log_level(LogLevel_Debug); - - int retval = 1; - - const ZarrDataType dtype = ZarrDataType_uint16; - const unsigned int nbytes_px = zarr::bytes_of_type(dtype); - - try { - auto thread_pool = std::make_shared( - std::thread::hardware_concurrency(), - [](const std::string& err) { LOG_ERROR("Error: ", err); }); - - std::vector dims; - dims.emplace_back( - "t", ZarrDimensionType_Time, array_timepoints, chunk_timepoints, 0); - dims.emplace_back( - "c", ZarrDimensionType_Channel, array_channels, chunk_channels, 0); - dims.emplace_back( - "z", ZarrDimensionType_Space, array_planes, chunk_planes, 0); - dims.emplace_back( - "y", ZarrDimensionType_Space, array_height, chunk_height, 0); - dims.emplace_back( - "x", ZarrDimensionType_Space, array_width, chunk_width, 0); - - auto config = std::make_shared( - base_dir.string(), - "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - level_of_detail); - - { - auto writer = std::make_unique( - config, - thread_pool, - std::make_shared(), - nullptr); - - const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); - - for (auto i = 0; i < n_frames; ++i) { // 2 time points - CHECK(writer->write_frame(data)); - } - - CHECK(finalize_array(std::move(writer))); - } - - check_json(); - - const auto expected_file_size = chunk_width * chunk_height * - chunk_planes * chunk_channels * - chunk_timepoints * nbytes_px; - - const fs::path 
data_root = base_dir; - - CHECK(fs::is_directory(data_root)); - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root / std::to_string(t); - CHECK(fs::is_directory(t_dir)); - - for (auto c = 0; c < chunks_in_c; ++c) { - const auto c_dir = t_dir / std::to_string(c); - CHECK(fs::is_directory(c_dir)); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = c_dir / std::to_string(z); - CHECK(fs::is_directory(z_dir)); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir / std::to_string(y); - CHECK(fs::is_directory(y_dir)); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir / std::to_string(x); - CHECK(fs::is_regular_file(x_file)); - const auto file_size = fs::file_size(x_file); - EXPECT_EQ(int, file_size, expected_file_size); - } - - CHECK(!fs::is_regular_file( - y_dir / std::to_string(chunks_in_x))); - } - - CHECK( - !fs::is_directory(z_dir / std::to_string(chunks_in_y))); - } - - CHECK(!fs::is_directory(c_dir / std::to_string(chunks_in_z))); - } - - CHECK(!fs::is_directory(t_dir / std::to_string(chunks_in_c))); - } - - CHECK(!fs::is_directory(data_root / std::to_string(chunks_in_t))); - - retval = 0; - } catch (const std::exception& exc) { - LOG_ERROR("Exception: ", exc.what()); - } - - // cleanup - if (fs::exists(base_dir)) { - fs::remove_all(base_dir); - } - return retval; -} \ No newline at end of file diff --git a/tests/unit-tests/v2-array-write-frame-to-chunks.cpp b/tests/unit-tests/v2-array-write-frame-to-chunks.cpp deleted file mode 100644 index 8940195f..00000000 --- a/tests/unit-tests/v2-array-write-frame-to-chunks.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include "v2.array.hh" -#include "unit.test.macros.hh" -#include "zarr.common.hh" - -#include - -namespace fs = std::filesystem; - -int -main() -{ - const auto base_dir = fs::temp_directory_path() / TEST; - int retval = 1; - - const unsigned int array_width = 64, array_height = 48, array_planes = 2, - array_channels = 1, array_timepoints = 2; - 
const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 1, - chunk_channels = 1, chunk_timepoints = 1; - - const unsigned int n_frames = - array_planes * array_channels * array_timepoints; - - const ZarrDataType dtype = ZarrDataType_uint16; - const unsigned int nbytes_px = zarr::bytes_of_type(dtype); - - try { - auto thread_pool = std::make_shared( - 0, [](const std::string& err) { LOG_ERROR("Error: ", err); }); - - std::vector dims; - dims.emplace_back( - "t", ZarrDimensionType_Time, array_timepoints, chunk_timepoints, 0); - dims.emplace_back( - "c", ZarrDimensionType_Channel, array_channels, chunk_channels, 0); - dims.emplace_back( - "z", ZarrDimensionType_Space, array_planes, chunk_planes, 0); - dims.emplace_back( - "y", ZarrDimensionType_Space, array_height, chunk_height, 0); - dims.emplace_back( - "x", ZarrDimensionType_Space, array_width, chunk_width, 0); - - auto config = std::make_shared( - base_dir.string(), - "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - 0); - - zarr::V2Array writer(config, - thread_pool, - std::make_shared(), - nullptr); - - const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); - - for (auto i = 0; i < n_frames; ++i) { - CHECK(writer.write_frame(data) == frame_size); - } - - retval = 0; - } catch (const std::exception& exc) { - LOG_ERROR("Exception: ", exc.what()); - } - - // cleanup - if (fs::exists(base_dir)) { - fs::remove_all(base_dir); - } - - return retval; -} \ No newline at end of file diff --git a/tests/unit-tests/v2-array-write-ragged-append-dim.cpp b/tests/unit-tests/v2-array-write-ragged-append-dim.cpp deleted file mode 100644 index 5d9c6051..00000000 --- a/tests/unit-tests/v2-array-write-ragged-append-dim.cpp +++ /dev/null @@ -1,150 +0,0 @@ -#include "v2.array.hh" -#include "unit.test.macros.hh" -#include "zarr.common.hh" - -#include -#include - -namespace fs = 
std::filesystem; - -namespace { -const fs::path base_dir = fs::temp_directory_path() / TEST; - -const unsigned int array_width = 64, array_height = 48, array_planes = 5; -const unsigned int n_frames = array_planes; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks - -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks, ragged - -const int level_of_detail = 1; -} // namespace - -void -check_json() -{ - fs::path zarray_path = base_dir / ".zarray"; - CHECK(fs::is_regular_file(zarray_path)); - - std::ifstream f(zarray_path); - nlohmann::json zarray = nlohmann::json::parse(f); - - EXPECT(zarray["dtype"].get() == "|u1", - "Expected dtype to be |u1, but got ", - zarray["dtype"].get().c_str()); - - EXPECT_EQ(int, zarray["zarr_format"].get(), 2); - - const auto& chunks = zarray["chunks"]; - EXPECT_EQ(int, chunks.size(), 3); - EXPECT_EQ(int, chunks[0].get(), chunk_planes); - EXPECT_EQ(int, chunks[1].get(), chunk_height); - EXPECT_EQ(int, chunks[2].get(), chunk_width); - - const auto& shape = zarray["shape"]; - EXPECT_EQ(int, shape.size(), 3); - EXPECT_EQ(int, shape[0].get(), array_planes); - EXPECT_EQ(int, shape[1].get(), array_height); - EXPECT_EQ(int, shape[2].get(), array_width); -} - -int -main() -{ - Logger::set_log_level(LogLevel_Debug); - - int retval = 1; - - const ZarrDataType dtype = ZarrDataType_uint8; - const unsigned int nbytes_px = zarr::bytes_of_type(dtype); - - try { - auto thread_pool = std::make_shared( - 1, [](const std::string& err) { LOG_ERROR("Error: ", err); }); - - std::vector dims; - dims.emplace_back( - "z", ZarrDimensionType_Space, array_planes, chunk_planes, 0); - dims.emplace_back( - "y", ZarrDimensionType_Space, array_height, chunk_height, 0); - dims.emplace_back( - "x", 
ZarrDimensionType_Space, array_width, chunk_width, 0); - - auto config = std::make_shared( - base_dir.string(), - "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - level_of_detail); - - { - auto writer = std::make_unique( - config, - thread_pool, - std::make_shared(), - nullptr); - - const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); - - for (auto i = 0; i < n_frames; ++i) { // 2 time points - CHECK(writer->write_frame(data)); - } - - CHECK(finalize_array(std::move(writer))); - } - - check_json(); - - const auto expected_file_size = - chunk_width * chunk_height * chunk_planes * nbytes_px; - - const fs::path data_root = base_dir; - - CHECK(fs::is_directory(data_root)); - - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = data_root / std::to_string(z); - CHECK(fs::is_directory(z_dir)); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir / std::to_string(y); - CHECK(fs::is_directory(y_dir)); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir / std::to_string(x); - CHECK(fs::is_regular_file(x_file)); - const auto file_size = fs::file_size(x_file); - EXPECT_EQ(int, file_size, expected_file_size); - } - - CHECK( - !fs::is_regular_file(y_dir / std::to_string(chunks_in_x))); - } - - CHECK(!fs::is_directory(z_dir / std::to_string(chunks_in_y))); - } - - CHECK(!fs::is_directory(data_root / std::to_string(chunks_in_z))); - - retval = 0; - } catch (const std::exception& exc) { - LOG_ERROR("Exception: ", exc.what()); - } - - // cleanup - if (fs::exists(base_dir)) { - fs::remove_all(base_dir); - } - - return retval; -} \ No newline at end of file diff --git a/tests/unit-tests/v2-array-write-ragged-internal-dim.cpp b/tests/unit-tests/v2-array-write-ragged-internal-dim.cpp deleted file mode 100644 index e3357aba..00000000 --- a/tests/unit-tests/v2-array-write-ragged-internal-dim.cpp +++ 
/dev/null @@ -1,168 +0,0 @@ -#include "v2.array.hh" -#include "unit.test.macros.hh" -#include "zarr.common.hh" - -#include -#include - -namespace fs = std::filesystem; - -namespace { -const fs::path base_dir = fs::temp_directory_path() / TEST; - -const unsigned int array_width = 64, array_height = 48, array_planes = 5, - array_timepoints = 5; -const unsigned int n_frames = array_planes * array_timepoints; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_timepoints = 5; - -const unsigned int chunks_in_x = - (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = - (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = - (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks, ragged -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; // 1 chunk - -const int level_of_detail = 2; -} // namespace - -void -check_json() -{ - fs::path zarray_path = base_dir / ".zarray"; - CHECK(fs::is_regular_file(zarray_path)); - - std::ifstream f(zarray_path); - nlohmann::json zarray = nlohmann::json::parse(f); - - EXPECT(zarray["dtype"].get() == "()); - - EXPECT_EQ(int, zarray["zarr_format"].get(), 2); - - const auto& chunks = zarray["chunks"]; - EXPECT_EQ(int, chunks.size(), 4); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_planes); - EXPECT_EQ(int, chunks[2].get(), chunk_height); - EXPECT_EQ(int, chunks[3].get(), chunk_width); - - const auto& shape = zarray["shape"]; - EXPECT_EQ(int, shape.size(), 4); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_planes); - EXPECT_EQ(int, shape[2].get(), array_height); - EXPECT_EQ(int, shape[3].get(), array_width); -} - -int -main() -{ - Logger::set_log_level(LogLevel_Debug); - - int retval = 1; - - const ZarrDataType dtype = ZarrDataType_float64; - const unsigned int nbytes_px = 
zarr::bytes_of_type(dtype); - - try { - auto thread_pool = std::make_shared( - std::thread::hardware_concurrency(), - [](const std::string& err) { LOG_ERROR("Error: ", err); }); - - std::vector dims; - dims.emplace_back( - "t", ZarrDimensionType_Time, array_timepoints, chunk_timepoints, 0); - dims.emplace_back( - "z", ZarrDimensionType_Space, array_planes, chunk_planes, 0); - dims.emplace_back( - "y", ZarrDimensionType_Space, array_height, chunk_height, 0); - dims.emplace_back( - "x", ZarrDimensionType_Space, array_width, chunk_width, 0); - - auto config = std::make_shared( - base_dir.string(), - "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - level_of_detail); - - { - auto writer = std::make_unique( - config, - thread_pool, - std::make_shared(), - nullptr); - - const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); - - for (auto i = 0; i < n_frames; ++i) { // 2 time points - CHECK(writer->write_frame(data)); - } - - CHECK(finalize_array(std::move(writer))); - } - - check_json(); - - const auto expected_file_size = chunk_width * chunk_height * - chunk_planes * chunk_timepoints * - nbytes_px; - - const fs::path data_root = base_dir; - - CHECK(fs::is_directory(data_root)); - - for (auto t = 0; t < chunks_in_t; ++t) { - const auto t_dir = data_root / std::to_string(t); - CHECK(fs::is_directory(t_dir)); - { - for (auto z = 0; z < chunks_in_z; ++z) { - const auto z_dir = t_dir / std::to_string(z); - CHECK(fs::is_directory(z_dir)); - - for (auto y = 0; y < chunks_in_y; ++y) { - const auto y_dir = z_dir / std::to_string(y); - CHECK(fs::is_directory(y_dir)); - - for (auto x = 0; x < chunks_in_x; ++x) { - const auto x_file = y_dir / std::to_string(x); - CHECK(fs::is_regular_file(x_file)); - const auto file_size = fs::file_size(x_file); - EXPECT_EQ(int, file_size, expected_file_size); - } - - CHECK(!fs::is_regular_file( - y_dir / 
std::to_string(chunks_in_x))); - } - - CHECK( - !fs::is_directory(z_dir / std::to_string(chunks_in_y))); - } - - CHECK(!fs::is_directory(t_dir / std::to_string(chunks_in_z))); - } - - CHECK(!fs::is_directory(data_root / std::to_string(chunks_in_t))); - } - - retval = 0; - } catch (const std::exception& exc) { - LOG_ERROR("Exception: ", exc.what()); - } - - // cleanup - if (fs::exists(base_dir)) { - fs::remove_all(base_dir); - } - - return retval; -} \ No newline at end of file From b2d95a7cbae11b1d930c6c02da9c041c2f833fa5 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 9 Oct 2025 10:13:30 -0400 Subject: [PATCH 02/38] Remove ZarrVersion and references to V2 in tests/benchmarks/examples --- README.md | 15 +- benchmarks/benchmark.cpp | 21 +- benchmarks/benchmark.py | 1 - benchmarks/run_benchmarks.py | 34 +-- examples/deprecated/README.md | 5 - ...zarrv2-compressed-multiscale-filesystem.py | 68 ------ .../deprecated/python/zarrv2-compressed-s3.py | 83 ------- .../python/zarrv2-raw-filesystem.py | 57 ----- examples/deprecated/zarrv2-compressed-s3.c | 110 ---------- examples/deprecated/zarrv2-raw-filesystem.c | 89 -------- examples/stream-compressed-multiscale-to-s3.c | 3 +- examples/stream-compressed-to-filesystem.c | 3 +- examples/stream-compressed-to-s3.c | 3 +- examples/stream-multiarray-to-filesystem.cpp | 1 - .../stream-raw-multiscale-to-filesystem.c | 3 +- examples/stream-raw-to-filesystem.c | 3 +- examples/stream-raw-to-s3.c | 3 +- include/acquire.zarr.h | 2 - include/zarr.types.h | 7 - python/acquire-zarr-py.cpp | 19 -- python/acquire_zarr/__init__.pyi | 33 --- python/tests/test_settings.py | 7 - python/tests/test_stream.py | 205 ++++-------------- setup.py | 2 +- src/streaming/array.base.cpp | 24 +- src/streaming/array.base.hh | 5 +- src/streaming/zarr.stream.cpp | 45 +--- src/streaming/zarr.stream.hh | 1 - tests/integration/estimate-memory-usage.cpp | 2 +- .../stream-2d-multiscale-to-filesystem.cpp | 1 - .../stream-3d-multiscale-to-filesystem.cpp | 1 - 
.../stream-compressed-to-filesystem.cpp | 1 - tests/integration/stream-compressed-to-s3.cpp | 1 - .../stream-mixed-flat-and-hcs-acquisition.cpp | 1 - .../integration/stream-multi-frame-append.cpp | 1 - .../stream-multiple-arrays-to-filesystem.cpp | 1 - .../stream-multiscale-trivial-3rd-dim.cpp | 1 - .../stream-named-array-to-filesystem.cpp | 1 - .../integration/stream-named-array-to-s3.cpp | 1 - .../stream-pure-hcs-acquisition.cpp | 1 - .../integration/stream-raw-to-filesystem.cpp | 4 +- tests/integration/stream-raw-to-s3.cpp | 1 - .../stream-with-ragged-final-shard.cpp | 1 - .../create-stream-with-metadata.cpp | 28 +-- tests/unit-tests/create-stream.cpp | 14 +- .../unit-tests/zarr-stream-partial-append.cpp | 47 ++-- 46 files changed, 113 insertions(+), 847 deletions(-) delete mode 100644 examples/deprecated/README.md delete mode 100644 examples/deprecated/python/zarrv2-compressed-multiscale-filesystem.py delete mode 100644 examples/deprecated/python/zarrv2-compressed-s3.py delete mode 100644 examples/deprecated/python/zarrv2-raw-filesystem.py delete mode 100644 examples/deprecated/zarrv2-compressed-s3.c delete mode 100644 examples/deprecated/zarrv2-raw-filesystem.c diff --git a/README.md b/README.md index ad6d3543..b082591e 100644 --- a/README.md +++ b/README.md @@ -9,14 +9,11 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/acquire-zarr)](https://pypistats.org/packages/acquire-zarr) [![Docs](https://img.shields.io/badge/docs-stable-blue)](https://acquire-project.github.io/acquire-docs/stable/) -This library supports chunked, compressed, multiscale streaming to [Zarr][], both [version 2][] and [version 3][], with +This library supports chunked, compressed, multiscale streaming to [Zarr][] [version 3][], with [OME-NGFF metadata]. This code builds targets for Python and C. -**Note:** Zarr Version 2 is deprecated and will be removed in a future release. -We recommend using Zarr Version 3 for new projects. 
- **For complete documentation, please visit the [Acquire documentation site](https://acquire-project.github.io/acquire-docs/).** ## Installing @@ -147,7 +144,6 @@ array.dimensions[0] = (ZarrDimensionProperties){ ZarrStreamSettings settings = (ZarrStreamSettings){ .store_path = "my_stream.zarr", - .version = ZarrVersion_3, .overwrite = true, // Optional: remove existing data at store_path if true .arrays = &array, .array_count = 1, // Number of arrays in the stream @@ -177,7 +173,6 @@ import numpy as np settings = aqz.StreamSettings( store_path="my_stream.zarr", - version=aqz.ZarrVersion.V3, overwrite=True # Optional: remove existing data at store_path if true ) @@ -242,7 +237,6 @@ import numpy as np # configure the stream with two arrays settings = aqz.StreamSettings( store_path="experiment.zarr", - version=aqz.ZarrVersion.V3, overwrite=True, # Remove existing data at store_path if true arrays=[ aqz.ArraySettings( @@ -337,8 +331,6 @@ When set to `false`, the stream will use the existing directory if it exists, or The library supports high-content screening (HCS) datasets following the [OME-NGFF 0.5](https://ngff.openmicroscopy.org/0.5/) specification. HCS data is organized into plates, wells, and fields of view, with automatic generation of appropriate metadata. -**Note:** HCS is *not* supported for Zarr V2. 
- Here's an example of creating an HCS dataset in Python: ```python @@ -415,7 +407,6 @@ plate = aqz.Plate( # Create stream with HCS configuration settings = aqz.StreamSettings( store_path="hcs_experiment.zarr", - version=aqz.ZarrVersion.V3, overwrite=True, hcs_plates=[plate] ) @@ -457,7 +448,6 @@ labels_array = aqz.ArraySettings( settings = aqz.StreamSettings( store_path="mixed_experiment.zarr", - version=aqz.ZarrVersion.V3, overwrite=True, arrays=[labels_array], # Flat arrays hcs_plates=[plate] # HCS structure @@ -570,7 +560,6 @@ ZarrHCSSettings hcs_settings = { // Configure stream ZarrStreamSettings settings = { .store_path = "hcs_experiment.zarr", - .version = ZarrVersion_3, .overwrite = true, .arrays = NULL, .array_count = 0, @@ -666,8 +655,6 @@ conda install -c conda-forge libstdcxx-ng [Zarr]: https://zarr.dev/ -[version 2]: https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html - [version 3]: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html [Blosc]: https://github.com/Blosc/c-blosc diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 3afc3272..9f0fe8dd 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -22,14 +22,12 @@ struct BenchmarkConfig { BenchmarkConfig() : chunk({ 1, 1, 1, 1, 1 }) - , zarr_version(3) , chunks_per_shard_x(0) , chunks_per_shard_y(0) { } ChunkConfig chunk; - int zarr_version; std::string compression; std::string storage; unsigned int chunks_per_shard_x; @@ -68,7 +66,6 @@ setup_stream(const BenchmarkConfig& config) ZarrStreamSettings settings = { .store_path = "benchmark.zarr", .s3_settings = nullptr, - .version = static_cast(config.zarr_version), .arrays = &array, .array_count = 1, }; @@ -165,17 +162,16 @@ print_usage(const char* program_name) << "Usage: " << program_name << " [OPTIONS]\n" << "Options:\n" << " --chunk t,c,z,y,x Chunk dimensions (required)\n" - << " --version VERSION Zarr version (2 or 3, required)\n" << " --compression TYPE Compression type (none/lz4/zstd, required)\n" 
<< " --storage TYPE Storage type (filesystem/s3, required)\n" - << " --shard-y NUM Chunks per shard Y (required for v3)\n" - << " --shard-x NUM Chunks per shard X (required for v3)\n" + << " --shard-y NUM Chunks per shard Y\n" + << " --shard-x NUM Chunks per shard X\n" << " --s3-endpoint URL S3 endpoint (required for s3 storage)\n" << " --s3-bucket NAME S3 bucket name (required for s3 storage)\n" << " --s3-access-key ID S3 access key (required for s3 storage)\n" << " --s3-secret-key KEY S3 secret key (required for s3 storage)\n\n" << "Output is written to stdout in CSV format. Values are:\n" - << " Chunk dimensions (t,c,z,y,x), Zarr version, Compression type,\n" + << " Chunk dimensions (t,c,z,y,x), Compression type,\n" << " Storage type, Chunks per shard in Y, Chunks per shard in X, Time " "(s)\n"; } @@ -222,14 +218,6 @@ main(int argc, char* argv[]) return 1; } has_chunk = true; - } else if (arg == "--version" && i + 1 < argc) { - config.zarr_version = std::stoi(argv[++i]); - if (config.zarr_version != 2 && config.zarr_version != 3) { - std::cerr << "Invalid Zarr version: " << config.zarr_version - << "\n"; - print_usage(argv[0]); - return 1; - } } else if (arg == "--compression" && i + 1 < argc) { config.compression = argv[++i]; if (config.compression != "none" && config.compression != "lz4" && @@ -295,8 +283,7 @@ main(int argc, char* argv[]) std::to_string(config.chunk.x); // Write results to stdout - std::cout << chunk_str << "," << config.zarr_version << "," - << config.compression << "," << config.storage << "," + std::cout << chunk_str << "," << config.compression << "," << config.storage << "," << config.chunks_per_shard_y << "," << config.chunks_per_shard_x << "," << std::fixed << std::setprecision(3) << time << "\n"; diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 0d3dae90..ffc90056 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -109,7 +109,6 @@ def run_acquire_zarr_test( """Write data using acquire-zarr and 
print per-plane and total write times.""" settings = aqz.StreamSettings( store_path=path, - version=aqz.ZarrVersion.V3, arrays=[ aqz.ArraySettings( dimensions=[ diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py index 27523508..af6a114d 100644 --- a/benchmarks/run_benchmarks.py +++ b/benchmarks/run_benchmarks.py @@ -97,37 +97,7 @@ def main(): outfile = open(f"zarr_benchmarks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", "w") # print header - write_out("chunk_size,zarr_version,compression,storage,chunks_per_shard_y,chunks_per_shard_x,time_seconds,run") - - # Run V2 benchmarks - for chunk in CHUNK_CONFIGS: - for compression in COMPRESSION_TYPES: - # Filesystem storage - cmd = [ - executable, - "--chunk", chunk, - "--version", "2", - "--compression", compression, - "--storage", "filesystem" - ] - for run in range(5): - success, output = run_benchmark(cmd) - if success and output: - write_out(output + f",{run + 1}") - - # S3 storage if configured - if s3_args: - cmd = [ - executable, - "--chunk", chunk, - "--version", "2", - "--compression", compression, - "--storage", "s3" - ] + s3_args - for run in range(5): - success, output = run_benchmark(cmd) - if success and output: - write_out(output + f",{run + 1}") + write_out("chunk_size,compression,storage,chunks_per_shard_y,chunks_per_shard_x,time_seconds,run") # Run V3 benchmarks with sharding for chunk in CHUNK_CONFIGS: @@ -149,7 +119,6 @@ def main(): cmd = [ executable, "--chunk", chunk, - "--version", "3", "--compression", compression, "--storage", "filesystem", "--shard-y", str(cps_y), @@ -165,7 +134,6 @@ def main(): cmd = [ executable, "--chunk", chunk, - "--version", "3", "--compression", compression, "--storage", "s3", "--shard-y", str(cps_y), diff --git a/examples/deprecated/README.md b/examples/deprecated/README.md deleted file mode 100644 index 5621654a..00000000 --- a/examples/deprecated/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Zarr V2 is deprecated - -These examples are for streaming 
Zarr V2, which is deprecated. -They are not guaranteed to work and will be removed in a future release. -Please use the new streaming Zarr V3 API instead. \ No newline at end of file diff --git a/examples/deprecated/python/zarrv2-compressed-multiscale-filesystem.py b/examples/deprecated/python/zarrv2-compressed-multiscale-filesystem.py deleted file mode 100644 index c81b6986..00000000 --- a/examples/deprecated/python/zarrv2-compressed-multiscale-filesystem.py +++ /dev/null @@ -1,68 +0,0 @@ -# Basic Zarr V2 to filesystem -import numpy as np -from acquire_zarr import ( - StreamSettings, ZarrStream, Dimension, DimensionType, ZarrVersion, DataType, Compressor, CompressionCodec, - CompressionSettings -) - - -def make_sample_data(): - return np.random.randint( - 0, 65535, - (32, 48, 64), # Shape matches chunk size for time dimension - dtype=np.int32 - ) - - -def main(): - # Configure stream settings - settings = StreamSettings() - - # Configure compression - settings.compression = CompressionSettings( - compressor=Compressor.BLOSC1, - codec=CompressionCodec.BLOSC_LZ4, - level=1, - shuffle=2, # bitshuffle - ) - - # Configure dimensions (t, y, x) - settings.dimensions.extend([ - Dimension( - name="t", - kind=DimensionType.TIME, - array_size_px=0, # Unlimited - chunk_size_px=32, - shard_size_chunks=1, - ), - Dimension( - name="y", - kind=DimensionType.SPACE, - array_size_px=48, - chunk_size_px=16, - shard_size_chunks=1, - ), - Dimension( - name="x", - kind=DimensionType.SPACE, - array_size_px=64, - chunk_size_px=32, - shard_size_chunks=1, - ), - ]) - - settings.store_path = "output_v2_multiscale.zarr" - settings.version = ZarrVersion.V2 - settings.data_type = DataType.INT32 - settings.multiscale = True - - # Create stream - stream = ZarrStream(settings) - - # Create and write sample data - for i in range(10): - stream.append(make_sample_data()) - - -if __name__ == "__main__": - main() diff --git a/examples/deprecated/python/zarrv2-compressed-s3.py 
b/examples/deprecated/python/zarrv2-compressed-s3.py deleted file mode 100644 index e3a3e5f3..00000000 --- a/examples/deprecated/python/zarrv2-compressed-s3.py +++ /dev/null @@ -1,83 +0,0 @@ -# Zarr V2 with ZSTD compression to S3 -import numpy as np - -# Ensure that you have set your S3 credentials in the environment variables -# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and optionally AWS_SESSION_TOKEN -# BEFORE importing acquire_zarr -from acquire_zarr import ( - StreamSettings, ZarrStream, Dimension, DimensionType, ZarrVersion, - DataType, Compressor, CompressionCodec, CompressionSettings, S3Settings -) - - -def make_sample_data(): - return np.random.randint( - 0, 65535, - (32, 3, 48, 64), # Shape matches chunk sizes - dtype=np.int32 - ) - -def main(): - settings = StreamSettings() - - # Configure S3 - settings.s3 = S3Settings( - endpoint="http://localhost:9000", - bucket_name="mybucket", - region="us-east-2" - ) - - # Configure compression - settings.compression = CompressionSettings( - compressor=Compressor.BLOSC1, - codec=CompressionCodec.BLOSC_ZSTD, - level=1, - shuffle=1, - ) - - # Configure 4D array (t, c, y, x) - settings.dimensions.extend([ - Dimension( - name="t", - kind=DimensionType.TIME, - array_size_px=0, # Unlimited - chunk_size_px=32, - shard_size_chunks=1, - ), - Dimension( - name="c", - kind=DimensionType.CHANNEL, - array_size_px=3, - chunk_size_px=3, - shard_size_chunks=1, - ), - Dimension( - name="y", - kind=DimensionType.SPACE, - array_size_px=48, - chunk_size_px=16, - shard_size_chunks=1, - ), - Dimension( - name="x", - kind=DimensionType.SPACE, - array_size_px=64, - chunk_size_px=32, - shard_size_chunks=1, - ), - ]) - - settings.store_path = "output_v2_s3.zarr" - settings.version = ZarrVersion.V2 - settings.data_type = DataType.INT32 - - # Create stream - stream = ZarrStream(settings) - - # Create and write sample data - for i in range(10): - stream.append(make_sample_data()) - - -if __name__ == "__main__": - main() diff --git 
a/examples/deprecated/python/zarrv2-raw-filesystem.py b/examples/deprecated/python/zarrv2-raw-filesystem.py deleted file mode 100644 index fea75288..00000000 --- a/examples/deprecated/python/zarrv2-raw-filesystem.py +++ /dev/null @@ -1,57 +0,0 @@ -# Basic Zarr V2 to filesystem -import numpy as np -from acquire_zarr import ( - StreamSettings, ZarrStream, Dimension, DimensionType, ZarrVersion, DataType -) - - -def make_sample_data(): - return np.random.randint( - 0, 65535, - (32, 48, 64), # Shape matches chunk size for time dimension - dtype=np.int32 - ) - -def main(): - # Configure stream settings - settings = StreamSettings() - - # Configure dimensions (t, y, x) - settings.dimensions.extend([ - Dimension( - name="t", - kind=DimensionType.TIME, - array_size_px=0, # Unlimited - chunk_size_px=32, - shard_size_chunks=1, - ), - Dimension( - name="y", - kind=DimensionType.SPACE, - array_size_px=48, - chunk_size_px=16, - shard_size_chunks=1, - ), - Dimension( - name="x", - kind=DimensionType.SPACE, - array_size_px=64, - chunk_size_px=32, - shard_size_chunks=1, - ), - ]) - - settings.store_path = "output_v2.zarr" - settings.version = ZarrVersion.V2 - settings.data_type = DataType.INT32 - - # Create stream - stream = ZarrStream(settings) - - # Create and write sample data - for i in range(10): - stream.append(make_sample_data()) - - -if __name__ == "__main__": - main() diff --git a/examples/deprecated/zarrv2-compressed-s3.c b/examples/deprecated/zarrv2-compressed-s3.c deleted file mode 100644 index 84605615..00000000 --- a/examples/deprecated/zarrv2-compressed-s3.c +++ /dev/null @@ -1,110 +0,0 @@ -/// @file zarrv2-compressed-s3.c -/// @brief Zarr V2 with ZSTD compression to S3 -#include "acquire.zarr.h" -#include -#include - -int main() { - // Configure S3 - // Ensure that you have set your S3 credentials in the environment variables - // AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and optionally AWS_SESSION_TOKEN - ZarrS3Settings s3 = { - .endpoint = "http://localhost:9000", 
- .bucket_name = "mybucket", - }; - - // Configure compression - ZarrCompressionSettings compression = { - .compressor = ZarrCompressor_Blosc1, - .codec = ZarrCompressionCodec_BloscZstd, - .level = 1, - .shuffle = 1 - }; - - // Configure stream settings - ZarrStreamSettings settings = { - .store_path = "output_v2_s3.zarr", - .s3_settings = &s3, - .compression_settings = &compression, - .data_type = ZarrDataType_int32, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - }; - - // Set up dimensions (t, c, y, x) - ZarrArraySettings_create_dimension_array(&settings.array, 4); - - settings.array.dimensions[0] = (ZarrDimensionProperties){ - .name = "t", - .type = ZarrDimensionType_Time, - .array_size_px = 0, // Unlimited - .chunk_size_px = 32, - .shard_size_chunks = 1 - }; - - settings.array.dimensions[1] = (ZarrDimensionProperties){ - .name = "c", - .type = ZarrDimensionType_Channel, - .array_size_px = 3, - .chunk_size_px = 3, - .shard_size_chunks = 1 - }; - - settings.array.dimensions[2] = (ZarrDimensionProperties){ - .name = "y", - .type = ZarrDimensionType_Space, - .array_size_px = 48, - .chunk_size_px = 16, - .shard_size_chunks = 1 - }; - - settings.array.dimensions[3] = (ZarrDimensionProperties){ - .name = "x", - .type = ZarrDimensionType_Space, - .array_size_px = 64, - .chunk_size_px = 32, - .shard_size_chunks = 1 - }; - - // Create stream - ZarrStream* stream = ZarrStream_create(&settings); - // Free Dimension array - ZarrArraySettings_destroy_dimension_array(&settings.array); - - if (!stream) { - fprintf(stderr, "Failed to create stream\n"); - return 1; - } - - // Create sample data - const size_t width = 64; - const size_t height = 48; - int32_t* frame = (int32_t*)malloc(width * height * sizeof(int32_t)); - - // Write frames - size_t bytes_written; - for (int i = 0; i < 10; i++) { - // Fill frame with sample data - for (size_t j = 0; j < width * height; j++) { - frame[j] = i * 1000 + j; - } - - ZarrStatusCode status = 
ZarrStream_append( - stream, - frame, - width * height * sizeof(int32_t), - &bytes_written - ); - - if (status != ZarrStatusCode_Success) { - fprintf(stderr, "Failed to append frame: %s\n", - Zarr_get_status_message(status)); - break; - } - } - - // Cleanup - free(frame); - ZarrStream_destroy(stream); - return 0; -} diff --git a/examples/deprecated/zarrv2-raw-filesystem.c b/examples/deprecated/zarrv2-raw-filesystem.c deleted file mode 100644 index 92b20feb..00000000 --- a/examples/deprecated/zarrv2-raw-filesystem.c +++ /dev/null @@ -1,89 +0,0 @@ -/// @file zarrv2-raw-filesystem.c -/// @brief Basic Zarr V2 streaming to filesystem -#include "acquire.zarr.h" -#include -#include - -int main() { - // Configure stream settings - ZarrStreamSettings settings = { - .store_path = "output_v2.zarr", - .s3_settings = NULL, - .compression_settings = NULL, - .data_type = ZarrDataType_int32, - .version = ZarrVersion_2, - .max_threads = 0, // use all available threads - }; - - // Set up dimensions (t, y, x) - ZarrArraySettings_create_dimension_array(&settings.array, 3); - - // Time dimension - unlimited size (0) - settings.array.dimensions[0] = (ZarrDimensionProperties){ - .name = "t", - .type = ZarrDimensionType_Time, - .array_size_px = 0, - .chunk_size_px = 32, - .shard_size_chunks = 1 - }; - - // Y dimension - 48 pixels - settings.array.dimensions[1] = (ZarrDimensionProperties){ - .name = "y", - .type = ZarrDimensionType_Space, - .array_size_px = 48, - .chunk_size_px = 16, - .shard_size_chunks = 1 - }; - - // X dimension - 64 pixels - settings.array.dimensions[2] = (ZarrDimensionProperties){ - .name = "x", - .type = ZarrDimensionType_Space, - .array_size_px = 64, - .chunk_size_px = 32, - .shard_size_chunks = 1 - }; - - // Create stream - ZarrStream* stream = ZarrStream_create(&settings); - // Free Dimension array - ZarrArraySettings_destroy_dimension_array(&settings.array); - - if (!stream) { - fprintf(stderr, "Failed to create stream\n"); - return 1; - } - - // Create sample 
data - const size_t width = 64; - const size_t height = 48; - int32_t* frame = (int32_t*)malloc(width * height * sizeof(int32_t)); - - // Write some frames - size_t bytes_written; - for (int i = 0; i < 10; i++) { - // Fill frame with sample data - for (size_t j = 0; j < width * height; j++) { - frame[j] = i * 1000 + j; - } - - ZarrStatusCode status = ZarrStream_append( - stream, - frame, - width * height * sizeof(int32_t), - &bytes_written - ); - - if (status != ZarrStatusCode_Success) { - fprintf(stderr, "Failed to append frame: %s\n", - Zarr_get_status_message(status)); - break; - } - } - - // Cleanup - free(frame); - ZarrStream_destroy(stream); - return 0; -} \ No newline at end of file diff --git a/examples/stream-compressed-multiscale-to-s3.c b/examples/stream-compressed-multiscale-to-s3.c index 21cf1ca6..2e003331 100644 --- a/examples/stream-compressed-multiscale-to-s3.c +++ b/examples/stream-compressed-multiscale-to-s3.c @@ -33,9 +33,8 @@ main() .multiscale = true, }; ZarrStreamSettings settings = { - .store_path = "output_v3_compressed_multiscale_s3.zarr", + .store_path = "output_compressed_multiscale_s3.zarr", .s3_settings = &s3, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/examples/stream-compressed-to-filesystem.c b/examples/stream-compressed-to-filesystem.c index addf268f..fe275e11 100644 --- a/examples/stream-compressed-to-filesystem.c +++ b/examples/stream-compressed-to-filesystem.c @@ -23,9 +23,8 @@ main() .data_type = ZarrDataType_uint16, }; ZarrStreamSettings settings = { - .store_path = "output_v3_compressed.zarr", + .store_path = "output_compressed.zarr", .s3_settings = NULL, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/examples/stream-compressed-to-s3.c b/examples/stream-compressed-to-s3.c index de652198..fbcfef39 100644 --- a/examples/stream-compressed-to-s3.c +++ 
b/examples/stream-compressed-to-s3.c @@ -31,9 +31,8 @@ main() .data_type = ZarrDataType_uint16, }; ZarrStreamSettings settings = { - .store_path = "output_v3_compressed_s3.zarr", + .store_path = "output_compressed_s3.zarr", .s3_settings = &s3, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/examples/stream-multiarray-to-filesystem.cpp b/examples/stream-multiarray-to-filesystem.cpp index c16eca9c..895a2fc7 100644 --- a/examples/stream-multiarray-to-filesystem.cpp +++ b/examples/stream-multiarray-to-filesystem.cpp @@ -41,7 +41,6 @@ main() ZarrStreamSettings settings = { .store_path = "output_multiarray.zarr", .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .overwrite = true, }; diff --git a/examples/stream-raw-multiscale-to-filesystem.c b/examples/stream-raw-multiscale-to-filesystem.c index 74339f30..c0396213 100644 --- a/examples/stream-raw-multiscale-to-filesystem.c +++ b/examples/stream-raw-multiscale-to-filesystem.c @@ -16,9 +16,8 @@ main() .multiscale = true, }; ZarrStreamSettings settings = { - .store_path = "output_v3_multiscale.zarr", + .store_path = "output_multiscale.zarr", .s3_settings = NULL, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/examples/stream-raw-to-filesystem.c b/examples/stream-raw-to-filesystem.c index 1a439394..2b3a4740 100644 --- a/examples/stream-raw-to-filesystem.c +++ b/examples/stream-raw-to-filesystem.c @@ -15,9 +15,8 @@ main() .data_type = ZarrDataType_uint16, }; ZarrStreamSettings settings = { - .store_path = "output_v3.zarr", + .store_path = "output.zarr", .s3_settings = NULL, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/examples/stream-raw-to-s3.c b/examples/stream-raw-to-s3.c index 47a81b77..812a4949 100644 --- a/examples/stream-raw-to-s3.c +++ 
b/examples/stream-raw-to-s3.c @@ -19,9 +19,8 @@ int main() { .data_type = ZarrDataType_uint16, }; ZarrStreamSettings settings = { - .store_path = "output_v3_s3.zarr", + .store_path = "output_s3.zarr", .s3_settings = &s3, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/include/acquire.zarr.h b/include/acquire.zarr.h index 5e558c18..97d869e6 100644 --- a/include/acquire.zarr.h +++ b/include/acquire.zarr.h @@ -23,8 +23,6 @@ extern "C" const char* store_path; /**< Path to the store. Filesystem path or S3 key prefix. */ ZarrS3Settings* s3_settings; /**< Optional S3 settings for the store. */ - ZarrVersion - version; /**< The version of the Zarr format to use. 2 or 3. */ unsigned int max_threads; /**< The maximum number of threads to use in the stream. Set to 0 to use the supported number of concurrent threads. */ diff --git a/include/zarr.types.h b/include/zarr.types.h index 9c845dc0..6512e551 100644 --- a/include/zarr.types.h +++ b/include/zarr.types.h @@ -26,13 +26,6 @@ extern "C" ZarrStatusCodeCount, } ZarrStatusCode; - typedef enum - { - ZarrVersion_2 = 2, - ZarrVersion_3, - ZarrVersionCount - } ZarrVersion; - typedef enum { ZarrLogLevel_Debug = 0, diff --git a/python/acquire-zarr-py.cpp b/python/acquire-zarr-py.cpp index 03834613..e926ded2 100644 --- a/python/acquire-zarr-py.cpp +++ b/python/acquire-zarr-py.cpp @@ -856,9 +856,6 @@ class PyZarrStreamSettings s3_settings_ = settings; } - ZarrVersion version() const { return version_; } - void set_version(ZarrVersion version) { version_ = version; } - unsigned int max_threads() const { return max_threads_; } void set_max_threads(unsigned int max_threads) { @@ -887,7 +884,6 @@ class PyZarrStreamSettings memset(&settings_, 0, sizeof(settings_)); settings_.store_path = store_path_.c_str(); - settings_.version = version_; settings_.max_threads = max_threads_; settings_.overwrite = static_cast(overwrite_); @@ -982,7 +978,6 @@ class 
PyZarrStreamSettings private: std::string store_path_; mutable std::optional s3_settings_{ std::nullopt }; - ZarrVersion version_{ ZarrVersion_3 }; unsigned int max_threads_{ std::thread::hardware_concurrency() }; bool overwrite_{ false }; @@ -1232,10 +1227,6 @@ PYBIND11_MODULE(acquire_zarr, m) "VectorDimension"); py::bind_vector>(m, "VectorArraySettings"); - py::enum_(m, "ZarrVersion") - .value("V2", ZarrVersion_2) - .value("V3", ZarrVersion_3); - py::enum_(m, "DataType") .value(data_type_to_str(ZarrDataType_uint8), ZarrDataType_uint8) .value(data_type_to_str(ZarrDataType_uint16), ZarrDataType_uint16) @@ -1949,7 +1940,6 @@ PYBIND11_MODULE(acquire_zarr, m) py::class_(m, "StreamSettings", py::dynamic_attr()) .def(py::init([](std::optional store_path, std::optional s3, - std::optional version, std::optional max_threads, std::optional overwrite, std::optional arrays, @@ -1961,9 +1951,6 @@ PYBIND11_MODULE(acquire_zarr, m) if (s3) { settings.set_s3(*s3); } - if (version) { - settings.set_version(*version); - } if (max_threads) { settings.set_max_threads(*max_threads); } @@ -1994,7 +1981,6 @@ PYBIND11_MODULE(acquire_zarr, m) py::kw_only(), py::arg("store_path") = std::nullopt, py::arg("s3") = std::nullopt, - py::arg("version") = std::nullopt, py::arg("max_threads") = std::nullopt, py::arg("overwrite") = std::nullopt, py::arg("arrays") = std::nullopt, @@ -2008,8 +1994,6 @@ PYBIND11_MODULE(acquire_zarr, m) repr += ", s3=" + self.s3()->repr(); } repr += - ", version=ZarrVersion." + - std::string(self.version() == ZarrVersion_2 ? "V2" : "V3") + ", max_threads=" + std::to_string(self.max_threads()) + "," + (self.overwrite() ? 
" overwrite=True" : " overwrite=False") + ")"; @@ -2036,9 +2020,6 @@ PYBIND11_MODULE(acquire_zarr, m) self.set_s3(obj.cast()); } }) - .def_property("version", - &PyZarrStreamSettings::version, - &PyZarrStreamSettings::set_version) .def_property("max_threads", &PyZarrStreamSettings::max_threads, &PyZarrStreamSettings::set_max_threads) diff --git a/python/acquire_zarr/__init__.pyi b/python/acquire_zarr/__init__.pyi index a521aab3..2ca6967f 100644 --- a/python/acquire_zarr/__init__.pyi +++ b/python/acquire_zarr/__init__.pyi @@ -30,7 +30,6 @@ __all__ = [ "StreamSettings", "Well", "ZarrStream", - "ZarrVersion", "get_log_level", "set_log_level", ] @@ -376,7 +375,6 @@ class StreamSettings: store_path: Path to the store. Can be a filesystem path or S3 key prefix. For S3, this becomes the key prefix within the specified bucket. s3: Optional S3 settings for cloud storage. If None, writes to local filesystem. - version: Zarr format version to use (V2 or V3). max_threads: Maximum number of threads for parallel processing. custom_metadata: Optional JSON-formatted custom metadata to include in the dataset. overwrite: If True, removes any existing data at store_path before writing. @@ -391,7 +389,6 @@ class StreamSettings: custom_metadata: Optional[str] s3: Optional[S3Settings] store_path: str - version: ZarrVersion max_threads: int overwrite: bool plates: List[Plate] @@ -433,36 +430,6 @@ class ZarrStream: def get_current_memory_usage(self) -> int: """Get the current memory usage of the stream in bytes.""" -class ZarrVersion: - """ - Zarr format version. - - Attributes: - V2: Zarr format version 2 - V3: Zarr format version 3 - """ - - V2: ClassVar[ZarrVersion] # value = - V3: ClassVar[ZarrVersion] # value = - __members__: ClassVar[ - dict[str, ZarrVersion] - ] # value = {'V2': , 'V3': } - - def __eq__(self, other: Any) -> bool: ... - def __getstate__(self) -> int: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __init__(self, value: int) -> None: ... 
- def __int__(self) -> int: ... - def __ne__(self, other: Any) -> bool: ... - def __repr__(self) -> str: ... - def __setstate__(self, state: int) -> None: ... - def __str__(self) -> str: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - def get_log_level() -> LogLevel: """Get the current log level for the Zarr API""" diff --git a/python/tests/test_settings.py b/python/tests/test_settings.py index a1df494b..2778dabb 100644 --- a/python/tests/test_settings.py +++ b/python/tests/test_settings.py @@ -222,13 +222,6 @@ def test_set_dimensions_in_constructor(): assert settings.dimensions[2].shard_size_chunks == 9 -def test_set_version(settings): - assert settings.version == aqz.ZarrVersion.V3 - - settings.version = aqz.ZarrVersion.V2 - assert settings.version == aqz.ZarrVersion.V2 - - def test_set_max_threads(settings): assert ( settings.max_threads > 0 diff --git a/python/tests/test_stream.py b/python/tests/test_stream.py index f6e80166..9eb2bf2f 100644 --- a/python/tests/test_stream.py +++ b/python/tests/test_stream.py @@ -29,7 +29,6 @@ S3Settings, Dimension, DimensionType, - ZarrVersion, LogLevel, DownsamplingMethod, Plate, @@ -310,21 +309,12 @@ def validate_v3_metadata(store_path: Path): assert not (store_path / "acquire.json").is_file() -@pytest.mark.parametrize( - ("version",), - [ - (ZarrVersion.V2,), - (ZarrVersion.V3,), - ], -) def test_create_stream( settings: StreamSettings, store_path: Path, request: pytest.FixtureRequest, - version: ZarrVersion, ): settings.store_path = str(store_path / f"{request.node.name}.zarr") - settings.version = version stream = ZarrStream(settings) assert stream @@ -335,46 +325,24 @@ def test_create_stream( # check that the stream created the zarr store assert store_path.is_dir() - if version == ZarrVersion.V2: - validate_v2_metadata(store_path) - - # no data written, so no array metadata - assert not (store_path / "0" / ".zarray").exists() - else: - validate_v3_metadata(store_path) + 
validate_v3_metadata(store_path) - # no data written, so no array metadata - assert not (store_path / "meta" / "0.array.json").exists() + # no data written, so no array metadata + assert not (store_path / "meta" / "0.array.json").exists() @pytest.mark.parametrize( ( - "version", "compression_codec", ), [ ( - ZarrVersion.V2, None, ), ( - ZarrVersion.V2, CompressionCodec.BLOSC_LZ4, ), ( - ZarrVersion.V2, - CompressionCodec.BLOSC_ZSTD, - ), - ( - ZarrVersion.V3, - None, - ), - ( - ZarrVersion.V3, - CompressionCodec.BLOSC_LZ4, - ), - ( - ZarrVersion.V3, CompressionCodec.BLOSC_ZSTD, ), ], @@ -382,11 +350,9 @@ def test_create_stream( def test_stream_data_to_filesystem( settings: StreamSettings, store_path: Path, - version: ZarrVersion, compression_codec: Optional[CompressionCodec], ): settings.store_path = str(store_path / "test.zarr") - settings.version = version if compression_codec is not None: settings.arrays[0].compression = CompressionSettings( compressor=Compressor.BLOSC1, @@ -420,10 +386,9 @@ def test_stream_data_to_filesystem( shard_size_bytes = chunk_size_bytes table_size_bytes = 16 # 2 * sizeof(uint64_t) - if version == ZarrVersion.V3: - for dim in settings.arrays[0].dimensions: - shard_size_bytes *= dim.shard_size_chunks - table_size_bytes *= dim.shard_size_chunks + for dim in settings.arrays[0].dimensions: + shard_size_bytes *= dim.shard_size_chunks + table_size_bytes *= dim.shard_size_chunks shard_size_bytes = ( shard_size_bytes + table_size_bytes + 4 ) # 4 bytes for crc32c checksum @@ -437,86 +402,45 @@ def test_stream_data_to_filesystem( metadata = array.metadata if compression_codec is not None: - if version == ZarrVersion.V2: - cname = ( - "lz4" - if compression_codec == CompressionCodec.BLOSC_LZ4 - else "zstd" - ) - compressor = metadata.compressor - assert compressor.cname == cname - assert compressor.clevel == 1 - assert compressor.shuffle == ncblosc.SHUFFLE - - # check that the data is compressed - assert (store_path / "test.zarr" / "0" / "0" / "0" 
/ "0").is_file() - assert ( - store_path / "test.zarr" / "0" / "0" / "0" / "0" - ).stat().st_size <= chunk_size_bytes - else: - cname = ( - zblosc.BloscCname.lz4 - if compression_codec == CompressionCodec.BLOSC_LZ4 - else zblosc.BloscCname.zstd - ) - blosc_codec = metadata.codecs[0].codecs[1] - assert blosc_codec.cname == cname - assert blosc_codec.clevel == 1 - assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle + cname = ( + zblosc.BloscCname.lz4 + if compression_codec == CompressionCodec.BLOSC_LZ4 + else zblosc.BloscCname.zstd + ) + blosc_codec = metadata.codecs[0].codecs[1] + assert blosc_codec.cname == cname + assert blosc_codec.clevel == 1 + assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).is_file() - assert ( + assert ( store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).stat().st_size <= shard_size_bytes + ).is_file() + assert ( + store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" + ).stat().st_size <= shard_size_bytes else: - if version == ZarrVersion.V2: - assert metadata.compressor is None - - assert (store_path / "test.zarr" / "0" / "0" / "0" / "0").is_file() - assert ( - store_path / "test.zarr" / "0" / "0" / "0" / "0" - ).stat().st_size == chunk_size_bytes - else: - assert len(metadata.codecs[0].codecs) == 1 + assert len(metadata.codecs[0].codecs) == 1 - assert ( + assert ( store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).is_file() - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).stat().st_size == shard_size_bytes + ).is_file() + assert ( + store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" + ).stat().st_size == shard_size_bytes @pytest.mark.parametrize( ( - "version", "compression_codec", ), [ ( - ZarrVersion.V2, - None, - ), - ( - ZarrVersion.V2, - CompressionCodec.BLOSC_LZ4, - ), - ( - ZarrVersion.V2, - CompressionCodec.BLOSC_ZSTD, - ), - ( - ZarrVersion.V3, None, ), ( - ZarrVersion.V3, CompressionCodec.BLOSC_LZ4, 
), ( - ZarrVersion.V3, CompressionCodec.BLOSC_ZSTD, ), ], @@ -525,7 +449,6 @@ def test_stream_data_to_s3( settings: StreamSettings, s3_settings: Optional[S3Settings], request: pytest.FixtureRequest, - version: ZarrVersion, compression_codec: Optional[CompressionCodec], ): if s3_settings is None: @@ -534,7 +457,6 @@ def test_stream_data_to_s3( settings.store_path = f"{request.node.name}.zarr".replace("[", "").replace( "]", "" ) - settings.version = version settings.s3 = s3_settings settings.data_type = np.uint16 if compression_codec is not None: @@ -579,31 +501,17 @@ def test_stream_data_to_s3( metadata = array.metadata if compression_codec is not None: - if version == ZarrVersion.V2: - cname = ( - "lz4" - if compression_codec == CompressionCodec.BLOSC_LZ4 - else "zstd" - ) - compressor = metadata.compressor - assert compressor.cname == cname - assert compressor.clevel == 1 - assert compressor.shuffle == ncblosc.SHUFFLE - else: - cname = ( - zblosc.BloscCname.lz4 - if compression_codec == CompressionCodec.BLOSC_LZ4 - else zblosc.BloscCname.zstd - ) - blosc_codec = metadata.codecs[0].codecs[1] - assert blosc_codec.cname == cname - assert blosc_codec.clevel == 1 - assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle + cname = ( + zblosc.BloscCname.lz4 + if compression_codec == CompressionCodec.BLOSC_LZ4 + else zblosc.BloscCname.zstd + ) + blosc_codec = metadata.codecs[0].codecs[1] + assert blosc_codec.cname == cname + assert blosc_codec.clevel == 1 + assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle else: - if version == ZarrVersion.V2: - assert metadata.compressor is None - else: - assert len(metadata.codecs[0].codecs) == 1 + assert len(metadata.codecs[0].codecs) == 1 # cleanup s3 = s3fs.S3FileSystem( @@ -630,23 +538,18 @@ def test_set_log_level(level: LogLevel): @pytest.mark.parametrize( - ("version", "overwrite"), + ("overwrite",), [ - (ZarrVersion.V2, False), - (ZarrVersion.V2, True), - (ZarrVersion.V3, False), - (ZarrVersion.V3, True), + (False,), + 
(True,), ], ) def test_write_custom_metadata( settings: StreamSettings, store_path: Path, - request: pytest.FixtureRequest, - version: ZarrVersion, overwrite: bool, ): - settings.store_path = str(store_path / f"{request.node.name}.zarr") - settings.version = version + settings.store_path = str(store_path / "test.zarr") stream = ZarrStream(settings) assert stream @@ -722,7 +625,6 @@ def test_write_transposed_array( ] ) settings.store_path = str(store_path / "test.zarr") - settings.version = ZarrVersion.V3 data = np.random.randint( -(2**16), @@ -786,7 +688,6 @@ def test_column_ragged_sharding( ] ) settings.store_path = str(store_path / "test.zarr") - settings.version = ZarrVersion.V3 data = np.random.randint( -(2**16), @@ -850,7 +751,6 @@ def test_custom_dimension_units_and_scales(store_path: Path): ] ) settings.store_path = str(store_path / "test.zarr") - settings.version = ZarrVersion.V3 data = np.random.randint( -(2**16), @@ -945,7 +845,6 @@ def test_2d_multiscale_stream(store_path: Path, method: DownsamplingMethod): ] ) settings.store_path = str(store_path / "test.zarr") - settings.version = ZarrVersion.V3 data = np.random.randint( -(2**16), @@ -1033,7 +932,6 @@ def test_3d_multiscale_stream(store_path: Path, method: DownsamplingMethod): ] ) settings.store_path = str(store_path / "test.zarr") - settings.version = ZarrVersion.V3 data = np.random.randint( 0, @@ -1123,7 +1021,6 @@ def test_stream_data_to_named_array( store_path / f"stream_to_named_array_{output_key.replace('/', '_')}.zarr" ) - settings.version = ZarrVersion.V3 settings.arrays[0].output_key = output_key settings.arrays[0].downsampling_method = downsampling_method settings.arrays[0].data_type = np.uint16 @@ -1176,7 +1073,6 @@ def test_stream_data_to_named_array( def test_anisotropic_downsampling(settings: StreamSettings, store_path: Path): settings.store_path = str(store_path / "anisotropic_downsampling.zarr") - settings.version = ZarrVersion.V3 settings.arrays[0].data_type = np.uint8 
settings.arrays[0].downsampling_method = DownsamplingMethod.MEAN settings.arrays[0].dimensions = [ @@ -1247,20 +1143,11 @@ def test_anisotropic_downsampling(settings: StreamSettings, store_path: Path): assert "4" not in group # No further downsampling -@pytest.mark.parametrize( - ("version",), - [ - (ZarrVersion.V2,), - (ZarrVersion.V3,), - ], -) def test_multiarray_metadata_structure( settings: StreamSettings, store_path: Path, - version: ZarrVersion, ): settings.store_path = str(store_path / "multiarray_metadata_test.zarr") - settings.version = version # Configure three arrays matching the JSON examples @@ -1426,13 +1313,9 @@ def test_multiarray_metadata_structure( array1_group = root_group["path"]["to"]["array1"] # Check multiscale metadata - if version == ZarrVersion.V2: - assert "multiscales" in array1_group.attrs - assert len(array1_group.attrs["multiscales"]) > 0 - else: - assert "ome" in array1_group.attrs - assert "multiscales" in array1_group.attrs["ome"] - assert len(array1_group.attrs["ome"]["multiscales"]) > 0 + assert "ome" in array1_group.attrs + assert "multiscales" in array1_group.attrs["ome"] + assert len(array1_group.attrs["ome"]["multiscales"]) > 0 # Check that all 3 LOD levels exist assert "0" in array1_group # LOD 0 (full resolution) @@ -1628,7 +1511,6 @@ def test_pure_hcs_acquisition(store_path: Path): settings = StreamSettings( store_path=str(store_path / "test.zarr"), - version=ZarrVersion.V3, overwrite=True, arrays=[], # No flat arrays, only HCS hcs_plates=[plate], @@ -1699,7 +1581,6 @@ def test_mixed_flat_and_hcs_acquisition(store_path: Path): settings = StreamSettings( store_path=str(store_path / "test.zarr"), - version=ZarrVersion.V3, overwrite=True, arrays=[labels_array], hcs_plates=[plate], diff --git a/setup.py b/setup.py index c9599a22..17c062b5 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def build_extension(self, ext): build_dir = os.path.abspath(os.path.join(ext.sourcedir, "build")) - cfg = "Debug" if self.debug else 
"Release" + cfg = "Debug" # if self.debug else "Release" cmake_args = [ "--preset=default", diff --git a/src/streaming/array.base.cpp b/src/streaming/array.base.cpp index c7470a25..ad98bccd 100644 --- a/src/streaming/array.base.cpp +++ b/src/streaming/array.base.cpp @@ -103,34 +103,20 @@ std::unique_ptr zarr::make_array(std::shared_ptr config, std::shared_ptr thread_pool, std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool, - ZarrVersion format) + std::shared_ptr s3_connection_pool) { // create a multiscale array at the dataset root (node_key is empty) or if // we have a genuine multiscale dataset const auto multiscale = config->node_key.empty() || config->downsampling_method.has_value(); - EXPECT(format < ZarrVersionCount, - "Invalid Zarr format: ", - static_cast(format)); std::unique_ptr array; if (multiscale) { - if (format == ZarrVersion_2) { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } else { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } + array = std::make_unique( + config, thread_pool, file_handle_pool, s3_connection_pool); } else { - if (format == ZarrVersion_2) { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } else { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } + array = std::make_unique( + config, thread_pool, file_handle_pool, s3_connection_pool); } return array; diff --git a/src/streaming/array.base.hh b/src/streaming/array.base.hh index c18d99dc..dfa4075a 100644 --- a/src/streaming/array.base.hh +++ b/src/streaming/array.base.hh @@ -99,11 +99,10 @@ class ArrayBase }; std::unique_ptr -make_array(std::shared_ptr config, +make_array(std::shared_ptr config, std::shared_ptr thread_pool, std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool, - ZarrVersion format); + std::shared_ptr s3_connection_pool); [[nodiscard]] bool 
finalize_array(std::unique_ptr&& array); diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index 30377974..ea3a4430 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -312,7 +312,6 @@ make_array_config(const ZarrArraySettings* settings, [[nodiscard]] bool validate_dimension(const ZarrDimensionProperties* dimension, - ZarrVersion version, bool is_append, std::string& error) { @@ -337,7 +336,7 @@ validate_dimension(const ZarrDimensionProperties* dimension, return false; } - if (version == ZarrVersion_3 && dimension->shard_size_chunks == 0) { + if (dimension->shard_size_chunks == 0) { error = "Shard size must be nonzero"; return false; } @@ -353,7 +352,6 @@ validate_dimension(const ZarrDimensionProperties* dimension, [[nodiscard]] bool validate_array_settings(const ZarrArraySettings* settings, const std::string& parent_path, - ZarrVersion version, std::string& error) { if (settings == nullptr) { @@ -401,8 +399,7 @@ validate_array_settings(const ZarrArraySettings* settings, // validate the dimensions individually for (size_t i = 0; i < ndims; ++i) { - if (!validate_dimension( - settings->dimensions + i, version, i == 0, error)) { + if (!validate_dimension(settings->dimensions + i, i == 0, error)) { return false; } } @@ -1011,15 +1008,6 @@ ZarrStream_s::validate_settings_(const struct ZarrStreamSettings_s* settings) return false; } - auto version = settings->version; - if (version < ZarrVersion_2 || version >= ZarrVersionCount) { - error_ = "Invalid Zarr version: " + std::to_string(version); - return false; - } else if (version == ZarrVersion_2) { - LOG_WARNING("Zarr version 2 is deprecated and will be removed in a " - "future release"); - } - if (settings->store_path == nullptr) { error_ = "Null pointer: store_path"; return false; @@ -1043,7 +1031,7 @@ ZarrStream_s::validate_settings_(const struct ZarrStreamSettings_s* settings) // validate the arrays individually for (auto i = 0; i < settings->array_count; ++i) { 
const auto& array_settings = settings->arrays[i]; - if (!validate_array_settings(&array_settings, "", version, error_)) { + if (!validate_array_settings(&array_settings, "", error_)) { return false; } } @@ -1129,7 +1117,7 @@ ZarrStream_s::validate_settings_(const struct ZarrStreamSettings_s* settings) std::string parent_path = plate_path; parent_path += "/" + row_name + "/" + col_name; if (!validate_array_settings( - field.array_settings, parent_path, version, error_)) { + field.array_settings, parent_path, error_)) { return false; } @@ -1174,7 +1162,7 @@ ZarrStream_s::configure_array_(const ZarrArraySettings* settings, .frame_buffer_offset = 0 }; try { output_node.array = zarr::make_array( - config, thread_pool_, file_handle_pool_, s3_connection_pool_, version_); + config, thread_pool_, file_handle_pool_, s3_connection_pool_); } catch (const std::exception& exc) { set_error_(exc.what()); } @@ -1203,11 +1191,6 @@ ZarrStream_s::commit_hcs_settings_(const ZarrHCSSettings* hcs_settings) return true; // nothing to do } - if (version_ == ZarrVersion_2) { - set_error_("HCS settings are not supported in Zarr version 2"); - return false; - } - plates_.clear(); wells_.clear(); @@ -1313,7 +1296,6 @@ ZarrStream_s::commit_hcs_settings_(const ZarrHCSSettings* hcs_settings) bool ZarrStream_s::commit_settings_(const struct ZarrStreamSettings_s* settings) { - version_ = settings->version; store_path_ = zarr::trim(settings->store_path); std::optional bucket_name; @@ -1430,16 +1412,13 @@ ZarrStream_s::write_intermediate_metadata_() bucket_name = s3_settings_->bucket_name; } - const nlohmann::json group_metadata = - version_ == ZarrVersion_2 ? nlohmann::json({ { "zarr_format", 2 } }) - : nlohmann::json({ - { "zarr_format", 3 }, - { "consolidated_metadata", nullptr }, - { "node_type", "group" }, - { "attributes", nlohmann::json::object() }, - }); - const std::string metadata_key = - version_ == ZarrVersion_2 ? 
".zgroup" : "zarr.json"; + const nlohmann::json group_metadata = nlohmann::json({ + { "zarr_format", 3 }, + { "consolidated_metadata", nullptr }, + { "node_type", "group" }, + { "attributes", nlohmann::json::object() }, + }); + const std::string metadata_key = "zarr.json"; std::string metadata_str; for (const auto& parent_group_key : intermediate_group_paths_) { diff --git a/src/streaming/zarr.stream.hh b/src/streaming/zarr.stream.hh index be56cd0f..40d2fd0d 100644 --- a/src/streaming/zarr.stream.hh +++ b/src/streaming/zarr.stream.hh @@ -65,7 +65,6 @@ struct ZarrStream_s std::string error_; // error message. If nonempty, an error occurred. - ZarrVersion version_; std::string store_path_; std::optional s3_settings_; diff --git a/tests/integration/estimate-memory-usage.cpp b/tests/integration/estimate-memory-usage.cpp index 5984c33c..2c487980 100644 --- a/tests/integration/estimate-memory-usage.cpp +++ b/tests/integration/estimate-memory-usage.cpp @@ -68,7 +68,7 @@ initialize_array(ZarrArraySettings& settings, void test_max_memory_usage() { - ZarrStreamSettings settings{ 0 }; + ZarrStreamSettings settings{}; // create settings for a Zarr stream with one array EXPECT(ZarrStreamSettings_create_arrays(&settings, 1) == diff --git a/tests/integration/stream-2d-multiscale-to-filesystem.cpp b/tests/integration/stream-2d-multiscale-to-filesystem.cpp index 03cb9dee..f83de4cb 100644 --- a/tests/integration/stream-2d-multiscale-to-filesystem.cpp +++ b/tests/integration/stream-2d-multiscale-to-filesystem.cpp @@ -58,7 +58,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-3d-multiscale-to-filesystem.cpp b/tests/integration/stream-3d-multiscale-to-filesystem.cpp index 45012aa5..84bd85d1 100644 --- a/tests/integration/stream-3d-multiscale-to-filesystem.cpp +++ 
b/tests/integration/stream-3d-multiscale-to-filesystem.cpp @@ -63,7 +63,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-compressed-to-filesystem.cpp b/tests/integration/stream-compressed-to-filesystem.cpp index d4b13d9b..86988b42 100644 --- a/tests/integration/stream-compressed-to-filesystem.cpp +++ b/tests/integration/stream-compressed-to-filesystem.cpp @@ -61,7 +61,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-compressed-to-s3.cpp b/tests/integration/stream-compressed-to-s3.cpp index 72e3eba1..a817c515 100644 --- a/tests/integration/stream-compressed-to-s3.cpp +++ b/tests/integration/stream-compressed-to-s3.cpp @@ -179,7 +179,6 @@ setup() }; ZarrStreamSettings settings = { .store_path = TEST, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-mixed-flat-and-hcs-acquisition.cpp b/tests/integration/stream-mixed-flat-and-hcs-acquisition.cpp index 31c986e0..d2e08f75 100644 --- a/tests/integration/stream-mixed-flat-and-hcs-acquisition.cpp +++ b/tests/integration/stream-mixed-flat-and-hcs-acquisition.cpp @@ -245,7 +245,6 @@ make_mixed_stream() ZarrStreamSettings settings = { .store_path = TEST ".zarr", - .version = ZarrVersion_3, .overwrite = true, .arrays = &label_array, .array_count = 1, diff --git a/tests/integration/stream-multi-frame-append.cpp b/tests/integration/stream-multi-frame-append.cpp index 6e88852b..c9e04e0c 100644 --- a/tests/integration/stream-multi-frame-append.cpp +++ b/tests/integration/stream-multi-frame-append.cpp @@ -30,7 +30,6 @@ setup() { }; 
ZarrStreamSettings settings = { .store_path = test_path_cstr, - .version = ZarrVersion_3, .arrays = &array, .array_count = 1, }; diff --git a/tests/integration/stream-multiple-arrays-to-filesystem.cpp b/tests/integration/stream-multiple-arrays-to-filesystem.cpp index 5b59d3df..56cdd967 100644 --- a/tests/integration/stream-multiple-arrays-to-filesystem.cpp +++ b/tests/integration/stream-multiple-arrays-to-filesystem.cpp @@ -20,7 +20,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .overwrite = true, }; diff --git a/tests/integration/stream-multiscale-trivial-3rd-dim.cpp b/tests/integration/stream-multiscale-trivial-3rd-dim.cpp index 426e960f..e282aaf8 100644 --- a/tests/integration/stream-multiscale-trivial-3rd-dim.cpp +++ b/tests/integration/stream-multiscale-trivial-3rd-dim.cpp @@ -51,7 +51,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-named-array-to-filesystem.cpp b/tests/integration/stream-named-array-to-filesystem.cpp index 4a61afd3..d3c2208a 100644 --- a/tests/integration/stream-named-array-to-filesystem.cpp +++ b/tests/integration/stream-named-array-to-filesystem.cpp @@ -63,7 +63,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-named-array-to-s3.cpp b/tests/integration/stream-named-array-to-s3.cpp index 157f52e9..4412a1b7 100644 --- a/tests/integration/stream-named-array-to-s3.cpp +++ b/tests/integration/stream-named-array-to-s3.cpp @@ -181,7 +181,6 @@ setup() }; ZarrStreamSettings settings = { .store_path = TEST, - .version = 
ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-pure-hcs-acquisition.cpp b/tests/integration/stream-pure-hcs-acquisition.cpp index ef6e9d8b..bbdd1f87 100644 --- a/tests/integration/stream-pure-hcs-acquisition.cpp +++ b/tests/integration/stream-pure-hcs-acquisition.cpp @@ -214,7 +214,6 @@ make_hcs_stream() }; ZarrStreamSettings settings = { .store_path = TEST ".zarr", - .version = ZarrVersion_3, .overwrite = true, .hcs_settings = &hcs_settings, }; diff --git a/tests/integration/stream-raw-to-filesystem.cpp b/tests/integration/stream-raw-to-filesystem.cpp index d83be93e..974cddcd 100644 --- a/tests/integration/stream-raw-to-filesystem.cpp +++ b/tests/integration/stream-raw-to-filesystem.cpp @@ -62,7 +62,6 @@ setup() ZarrStreamSettings settings = { .store_path = test_path.c_str(), .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, @@ -70,8 +69,7 @@ setup() CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); - ZarrDimensionProperties* dim; - dim = settings.arrays->dimensions; + ZarrDimensionProperties* dim = settings.arrays->dimensions; *dim = DIM("t", ZarrDimensionType_Time, array_timepoints, diff --git a/tests/integration/stream-raw-to-s3.cpp b/tests/integration/stream-raw-to-s3.cpp index d8173420..3fbddeb0 100644 --- a/tests/integration/stream-raw-to-s3.cpp +++ b/tests/integration/stream-raw-to-s3.cpp @@ -182,7 +182,6 @@ setup() }; ZarrStreamSettings settings = { .store_path = TEST, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .arrays = &array, .array_count = 1, diff --git a/tests/integration/stream-with-ragged-final-shard.cpp b/tests/integration/stream-with-ragged-final-shard.cpp index 80d3f637..0c65ab74 100644 --- a/tests/integration/stream-with-ragged-final-shard.cpp +++ b/tests/integration/stream-with-ragged-final-shard.cpp @@ -19,7 +19,6 @@ 
main() ZarrStreamSettings settings = { .store_path = TEST ".zarr", .s3_settings = nullptr, - .version = ZarrVersion_3, .max_threads = 0, // use all available threads .overwrite = true, .arrays = &array, diff --git a/tests/unit-tests/create-stream-with-metadata.cpp b/tests/unit-tests/create-stream-with-metadata.cpp index 691e2a3a..54867986 100644 --- a/tests/unit-tests/create-stream-with-metadata.cpp +++ b/tests/unit-tests/create-stream-with-metadata.cpp @@ -41,11 +41,10 @@ configure_stream_dimensions(ZarrArraySettings* settings) } ZarrStream* -create_stream_with_metadata(ZarrVersion version) +create_stream_with_metadata() { ZarrStreamSettings settings; memset(&settings, 0, sizeof(settings)); - settings.version = version; settings.max_threads = 0; settings.store_path = TEST ".zarr"; @@ -68,11 +67,10 @@ create_stream_with_metadata(ZarrVersion version) } ZarrStream* -create_stream_no_metadata(ZarrVersion version) +create_stream_no_metadata() { ZarrStreamSettings settings; memset(&settings, 0, sizeof(settings)); - settings.version = version; settings.max_threads = 0; settings.store_path = TEST ".zarr"; @@ -120,7 +118,7 @@ main() try { { - auto* stream = create_stream_no_metadata(ZarrVersion_2); + auto* stream = create_stream_no_metadata(); CHECK(stream); check_files(false); ZarrStream_destroy(stream); @@ -129,25 +127,7 @@ main() } { - auto* stream = create_stream_with_metadata(ZarrVersion_2); - CHECK(stream); - check_files(true); - ZarrStream_destroy(stream); - - CHECK(destroy_directory()); - } - - { - auto* stream = create_stream_no_metadata(ZarrVersion_3); - CHECK(stream); - check_files(false); - ZarrStream_destroy(stream); - - CHECK(destroy_directory()); - } - - { - auto* stream = create_stream_with_metadata(ZarrVersion_3); + auto* stream = create_stream_with_metadata(); CHECK(stream); check_files(true); ZarrStream_destroy(stream); diff --git a/tests/unit-tests/create-stream.cpp b/tests/unit-tests/create-stream.cpp index 7adbd333..b62afe07 100644 --- 
a/tests/unit-tests/create-stream.cpp +++ b/tests/unit-tests/create-stream.cpp @@ -18,6 +18,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Time, .array_size_px = 100, .chunk_size_px = 10, + .shard_size_chunks = 1, }; dim = settings->dimensions + 1; @@ -26,6 +27,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Space, .array_size_px = 200, .chunk_size_px = 20, + .shard_size_chunks = 1, }; dim = settings->dimensions + 2; @@ -34,6 +36,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Space, .array_size_px = 300, .chunk_size_px = 30, + .shard_size_chunks = 1, }; } @@ -42,10 +45,8 @@ main() { int retval = 1; - ZarrStream* stream; - ZarrStreamSettings settings; - memset(&settings, 0, sizeof(settings)); - settings.version = ZarrVersion_2; + ZarrStream* stream = nullptr; + ZarrStreamSettings settings = {}; settings.max_threads = std::thread::hardware_concurrency(); try { @@ -75,9 +76,8 @@ main() // cleanup ZarrStream_destroy(stream); - std::error_code ec; - if (fs::is_directory(settings.store_path) && - !fs::remove_all(settings.store_path, ec)) { + if (std::error_code ec; fs::is_directory(settings.store_path) && + !fs::remove_all(settings.store_path, ec)) { LOG_ERROR("Failed to remove store path: ", ec.message().c_str()); } diff --git a/tests/unit-tests/zarr-stream-partial-append.cpp b/tests/unit-tests/zarr-stream-partial-append.cpp index a2356044..b89a84c2 100644 --- a/tests/unit-tests/zarr-stream-partial-append.cpp +++ b/tests/unit-tests/zarr-stream-partial-append.cpp @@ -19,6 +19,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Time, .array_size_px = 0, .chunk_size_px = 1, + .shard_size_chunks = 1, }; dim = settings->dimensions + 1; @@ -27,6 +28,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Space, .array_size_px = 48, .chunk_size_px = 48, + .shard_size_chunks = 1, }; dim = 
settings->dimensions + 2; @@ -35,6 +37,7 @@ configure_stream_dimensions(ZarrArraySettings* settings) .type = ZarrDimensionType_Space, .array_size_px = 64, .chunk_size_px = 64, + .shard_size_chunks = 1, }; } @@ -45,12 +48,13 @@ verify_file_data(const ZarrStreamSettings& settings) const size_t row_size = settings.arrays->dimensions[2].array_size_px, num_rows = settings.arrays->dimensions[1].array_size_px; - fs::path chunk_path = fs::path(settings.store_path) / "0" / "0" / "0" / "0"; - CHECK(fs::is_regular_file(chunk_path)); + fs::path shard_path = + fs::path(settings.store_path) / "0" / "c" / "0" / "0" / "0"; + CHECK(fs::is_regular_file(shard_path)); // Open and read the first chunk file { - std::ifstream file(chunk_path, std::ios::binary); + std::ifstream file(shard_path, std::ios::binary); CHECK(file.is_open()); // Get file size @@ -65,7 +69,8 @@ verify_file_data(const ZarrStreamSettings& settings) } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows); + constexpr size_t table_size = 2 * sizeof(uint64_t) + 4; + EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); for (size_t row = 0; row < num_rows; ++row) { // Check each byte in this row for (size_t col = 0; col < row_size; ++col) { @@ -74,12 +79,12 @@ verify_file_data(const ZarrStreamSettings& settings) } } - chunk_path = fs::path(settings.store_path) / "0" / "1" / "0" / "0"; - CHECK(fs::is_regular_file(chunk_path)); + shard_path = fs::path(settings.store_path) / "0" / "c" / "1" / "0" / "0"; + CHECK(fs::is_regular_file(shard_path)); // Open and read the next chunk file { - std::ifstream file(chunk_path, std::ios::binary); + std::ifstream file(shard_path, std::ios::binary); CHECK(file.is_open()); // Get file size @@ -94,7 +99,7 @@ verify_file_data(const ZarrStreamSettings& settings) } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows); + EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); for 
(size_t row = 0; row < num_rows; ++row) { // Check each byte in this row for (size_t col = 0; col < row_size; ++col) { @@ -108,12 +113,12 @@ verify_file_data(const ZarrStreamSettings& settings) // starting at 96 and ending at 191 uint8_t px_value = 96; - chunk_path = fs::path(settings.store_path) / "0" / "2" / "0" / "0"; - CHECK(fs::is_regular_file(chunk_path)); + shard_path = fs::path(settings.store_path) / "0" / "c" / "2" / "0" / "0"; + CHECK(fs::is_regular_file(shard_path)); // Open and read the next chunk file { - std::ifstream file(chunk_path, std::ios::binary); + std::ifstream file(shard_path, std::ios::binary); CHECK(file.is_open()); // Get file size @@ -128,18 +133,18 @@ verify_file_data(const ZarrStreamSettings& settings) } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows); + EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); - for (auto i = 0; i < buffer.size(); ++i) { + for (auto i = 0; i < row_size * num_rows; ++i) { EXPECT_EQ(int, buffer[i], px_value++); } - chunk_path = fs::path(settings.store_path) / "0" / "3" / "0" / "0"; - CHECK(fs::is_regular_file(chunk_path)); + shard_path = fs::path(settings.store_path) / "0" / "c" / "3" / "0" / "0"; + CHECK(fs::is_regular_file(shard_path)); // Open and read the next chunk file { - std::ifstream file(chunk_path, std::ios::binary); + std::ifstream file(shard_path, std::ios::binary); CHECK(file.is_open()); // Get file size @@ -154,9 +159,9 @@ verify_file_data(const ZarrStreamSettings& settings) } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows); + EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); - for (auto i = 0; i < buffer.size(); ++i) { + for (auto i = 0; i < row_size * num_rows; ++i) { EXPECT_EQ(int, buffer[i], px_value++); } } @@ -166,13 +171,11 @@ main() { int retval = 1; - ZarrStream* stream; - ZarrStreamSettings settings; - memset(&settings, 0, sizeof(settings)); + ZarrStream* 
stream = nullptr; + ZarrStreamSettings settings = {}; Zarr_set_log_level(ZarrLogLevel_Debug); - settings.version = ZarrVersion_2; settings.store_path = static_cast(TEST ".zarr"); settings.max_threads = 0; From cda2d13046f44d871ab86370af50ef13847a44c4 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 9 Oct 2025 10:15:36 -0400 Subject: [PATCH 03/38] Remove V2Array and V2MultiscaleArray --- src/streaming/CMakeLists.txt | 4 - src/streaming/v2.array.cpp | 248 -------------------------- src/streaming/v2.array.hh | 25 --- src/streaming/v2.multiscale.array.cpp | 74 -------- src/streaming/v2.multiscale.array.hh | 22 --- 5 files changed, 373 deletions(-) delete mode 100644 src/streaming/v2.array.cpp delete mode 100644 src/streaming/v2.array.hh delete mode 100644 src/streaming/v2.multiscale.array.cpp delete mode 100644 src/streaming/v2.multiscale.array.hh diff --git a/src/streaming/CMakeLists.txt b/src/streaming/CMakeLists.txt index 54122014..81b24bac 100644 --- a/src/streaming/CMakeLists.txt +++ b/src/streaming/CMakeLists.txt @@ -40,14 +40,10 @@ add_library(${tgt} array.base.cpp array.hh array.cpp - v2.array.hh - v2.array.cpp v3.array.hh v3.array.cpp multiscale.array.hh multiscale.array.cpp - v2.multiscale.array.hh - v2.multiscale.array.cpp v3.multiscale.array.hh v3.multiscale.array.cpp plate.hh diff --git a/src/streaming/v2.array.cpp b/src/streaming/v2.array.cpp deleted file mode 100644 index cdfdc4f4..00000000 --- a/src/streaming/v2.array.cpp +++ /dev/null @@ -1,248 +0,0 @@ -#include "v2.array.hh" - -#include "macros.hh" -#include "sink.hh" -#include "zarr.common.hh" - -#include - -#include -#include - -using json = nlohmann::json; - -namespace { -[[nodiscard]] -bool -sample_type_to_dtype(ZarrDataType t, std::string& t_str) - -{ - const std::string dtype_prefix = - std::endian::native == std::endian::big ? 
">" : "<"; - - switch (t) { - case ZarrDataType_uint8: - t_str = "|u1"; // byte order does not matter for 1-byte types - break; - case ZarrDataType_uint16: - t_str = dtype_prefix + "u2"; - break; - case ZarrDataType_uint32: - t_str = dtype_prefix + "u4"; - break; - case ZarrDataType_uint64: - t_str = dtype_prefix + "u8"; - break; - case ZarrDataType_int8: - t_str = "|i1"; // byte order does not matter for 1-byte types - break; - case ZarrDataType_int16: - t_str = dtype_prefix + "i2"; - break; - case ZarrDataType_int32: - t_str = dtype_prefix + "i4"; - break; - case ZarrDataType_int64: - t_str = dtype_prefix + "i8"; - break; - case ZarrDataType_float32: - t_str = dtype_prefix + "f4"; - break; - case ZarrDataType_float64: - t_str = dtype_prefix + "f8"; - break; - default: - LOG_ERROR("Unsupported sample type: ", t); - return false; - } - - return true; -} -} // namespace - -zarr::V2Array::V2Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : Array(config, thread_pool, file_handle_pool, s3_connection_pool) -{ -} - -std::vector -zarr::V2Array::metadata_keys_() const -{ - return { ".zarray" }; -} - -bool -zarr::V2Array::make_metadata_() -{ - metadata_strings_.clear(); - - std::string dtype; - if (!sample_type_to_dtype(config_->dtype, dtype)) { - return false; - } - - std::vector array_shape, chunk_shape; - - size_t append_size = frames_written_; - for (auto i = config_->dimensions->ndims() - 3; i > 0; --i) { - const auto& dim = config_->dimensions->at(i); - const auto& array_size_px = dim.array_size_px; - CHECK(array_size_px); - append_size = (append_size + array_size_px - 1) / array_size_px; - } - array_shape.push_back(append_size); - - chunk_shape.push_back(config_->dimensions->final_dim().chunk_size_px); - for (auto i = 1; i < config_->dimensions->ndims(); ++i) { - const auto& dim = config_->dimensions->at(i); - array_shape.push_back(dim.array_size_px); - 
chunk_shape.push_back(dim.chunk_size_px); - } - - json metadata; - metadata["zarr_format"] = 2; - metadata["shape"] = array_shape; - metadata["chunks"] = chunk_shape; - metadata["dtype"] = dtype; - metadata["fill_value"] = 0; - metadata["order"] = "C"; - metadata["filters"] = nullptr; - metadata["dimension_separator"] = "/"; - - if (config_->compression_params) { - const BloscCompressionParams bcp = *config_->compression_params; - metadata["compressor"] = json{ { "id", "blosc" }, - { "cname", bcp.codec_id }, - { "clevel", bcp.clevel }, - { "shuffle", bcp.shuffle } }; - } else { - metadata["compressor"] = nullptr; - } - - metadata_strings_.emplace(".zarray", metadata.dump(4)); - - return true; -} - -bool -zarr::V2Array::close_impl_() -{ - return true; // no-op -} - -std::string -zarr::V2Array::data_root_() const -{ - return node_path_() + "/" + std::to_string(append_chunk_index_); -} - -const DimensionPartsFun -zarr::V2Array::parts_along_dimension_() const -{ - return chunks_along_dimension; -} - -bool -zarr::V2Array::compress_and_flush_data_() -{ - // construct paths to chunk sinks - CHECK(data_paths_.empty()); - make_data_paths_(); - - const auto n_chunks = chunk_buffers_.size(); - CHECK(data_paths_.size() == n_chunks); - - const auto compression_params = config_->compression_params; - const auto bytes_per_px = bytes_of_type(config_->dtype); - - std::atomic all_successful = 1; - std::vector> futures; - - for (auto i = 0; i < n_chunks; ++i) { - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [bytes_per_px, - compression_params, - data_path = data_paths_[i], - chunk_buffer = std::move(chunk_buffers_[i].take()), - promise, - &all_successful, - this](std::string& err) mutable // chunk_buffer is mutable - { - bool success = true; - - if (!all_successful) { - promise->set_value(); - err = "Other jobs in batch have failed, not proceeding"; - return false; - } - - try { - // compress the chunk - if (compression_params) { - 
if (!(success = compress_in_place( - chunk_buffer, *compression_params, bytes_per_px))) { - err = "Failed to compress chunk at path " + data_path; - } - } - - if (success) { - if (auto sink = make_data_sink_(data_path); - sink == nullptr) { - err = "Failed to create sink for " + data_path; - success = false; - } else { - // try to write the chunk to the sink - if (!sink->write(0, chunk_buffer)) { - err = "Failed to write chunk to " + data_path; - success = false; - } else if (!finalize_sink(std::move(sink))) { - err = - "Failed to finalize sink at path " + data_path; - success = false; - } - } - } - } catch (const std::exception& exc) { - err = exc.what(); - success = false; - } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - if (std::string err; !job(err)) { - LOG_ERROR(err); - } - } - } - - // wait for all jobs to finish - for (auto& future : futures) { - future.wait(); - } - - return all_successful; -} - -void -zarr::V2Array::close_sinks_() -{ - data_paths_.clear(); -} - -bool -zarr::V2Array::should_rollover_() const -{ - return true; -} diff --git a/src/streaming/v2.array.hh b/src/streaming/v2.array.hh deleted file mode 100644 index faa6d1dd..00000000 --- a/src/streaming/v2.array.hh +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "array.hh" - -namespace zarr { -class V2Array final : public Array -{ - public: - V2Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); - - private: - std::vector metadata_keys_() const override; - bool make_metadata_() override; - - bool close_impl_() override; - std::string data_root_() const override; - const DimensionPartsFun parts_along_dimension_() const override; - bool compress_and_flush_data_() override; - void 
close_sinks_() override; - bool should_rollover_() const override; -}; -} // namespace zarr diff --git a/src/streaming/v2.multiscale.array.cpp b/src/streaming/v2.multiscale.array.cpp deleted file mode 100644 index 736d9cb9..00000000 --- a/src/streaming/v2.multiscale.array.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include "macros.hh" -#include "v2.multiscale.array.hh" -#include "zarr.common.hh" - -zarr::V2MultiscaleArray::V2MultiscaleArray( - std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : MultiscaleArray(config, thread_pool, file_handle_pool, s3_connection_pool) -{ - // dimensions may be null in the case of intermediate groups, e.g., the - // A in A/1 - if (config_->dimensions) { - CHECK(create_arrays_()); - } -} - -std::vector -zarr::V2MultiscaleArray::metadata_keys_() const -{ - return { ".zattrs", ".zgroup" }; -} - -bool -zarr::V2MultiscaleArray::make_metadata_() -{ - metadata_strings_.clear(); - - nlohmann::json metadata; - - // .zattrs - if (!arrays_.empty()) { - metadata = { { "multiscales", get_ome_metadata_() } }; - metadata_strings_.emplace(".zattrs", metadata.dump(4)); - } - - // .zgroup - metadata = { { "zarr_format", 2 } }; - metadata_strings_.emplace(".zgroup", metadata.dump(4)); - - return true; -} - -bool -zarr::V2MultiscaleArray::create_arrays_() -{ - arrays_.clear(); - - if (downsampler_) { - const auto& configs = downsampler_->writer_configurations(); - arrays_.resize(configs.size()); - - for (const auto& [lod, config] : configs) { - arrays_[lod] = std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_); - } - } else { - const auto config = make_base_array_config_(); - arrays_.push_back(std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_)); - } - - return true; -} - -nlohmann::json -zarr::V2MultiscaleArray::get_ome_metadata_() const -{ - auto multiscales = make_multiscales_metadata_(); - multiscales[0]["version"] = 
"0.4"; - multiscales[0]["name"] = "/"; - return multiscales; -} diff --git a/src/streaming/v2.multiscale.array.hh b/src/streaming/v2.multiscale.array.hh deleted file mode 100644 index 1d3cec2d..00000000 --- a/src/streaming/v2.multiscale.array.hh +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "multiscale.array.hh" -#include "v2.array.hh" - -namespace zarr { -class V2MultiscaleArray final : public MultiscaleArray -{ - public: - V2MultiscaleArray(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); - - private: - std::vector metadata_keys_() const override; - bool make_metadata_() override; - - bool create_arrays_() override; - nlohmann::json get_ome_metadata_() const override; -}; -} // namespace zarr \ No newline at end of file From 217ce98a73e58abd01125fc5f9e86b52ba8f292b Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 9 Oct 2025 15:06:15 -0400 Subject: [PATCH 04/38] V3Array -> Array --- src/streaming/CMakeLists.txt | 2 - src/streaming/array.base.cpp | 6 +- src/streaming/array.cpp | 583 ++++++++++++++++- src/streaming/array.hh | 21 +- src/streaming/v3.array.cpp | 605 ------------------ src/streaming/v3.array.hh | 33 - src/streaming/v3.multiscale.array.cpp | 4 +- src/streaming/v3.multiscale.array.hh | 2 +- tests/unit-tests/array-write-even.cpp | 12 +- .../array-write-ragged-append-dim.cpp | 4 +- .../array-write-ragged-internal-dim.cpp | 4 +- 11 files changed, 611 insertions(+), 665 deletions(-) delete mode 100644 src/streaming/v3.array.cpp delete mode 100644 src/streaming/v3.array.hh diff --git a/src/streaming/CMakeLists.txt b/src/streaming/CMakeLists.txt index 81b24bac..41be57b0 100644 --- a/src/streaming/CMakeLists.txt +++ b/src/streaming/CMakeLists.txt @@ -40,8 +40,6 @@ add_library(${tgt} array.base.cpp array.hh array.cpp - v3.array.hh - v3.array.cpp multiscale.array.hh multiscale.array.cpp v3.multiscale.array.hh diff --git a/src/streaming/array.base.cpp 
b/src/streaming/array.base.cpp index ad98bccd..53fd84d7 100644 --- a/src/streaming/array.base.cpp +++ b/src/streaming/array.base.cpp @@ -3,9 +3,7 @@ #include "array.base.hh" #include "multiscale.array.hh" #include "macros.hh" -#include "v2.array.hh" -#include "v3.array.hh" -#include "v2.multiscale.array.hh" +#include "array.hh" #include "v3.multiscale.array.hh" zarr::ArrayBase::ArrayBase(std::shared_ptr config, @@ -115,7 +113,7 @@ zarr::make_array(std::shared_ptr config, array = std::make_unique( config, thread_pool, file_handle_pool, s3_connection_pool); } else { - array = std::make_unique( + array = std::make_unique( config, thread_pool, file_handle_pool, s3_connection_pool); } diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index c0664212..7d4be0e6 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -3,10 +3,65 @@ #include "sink.hh" #include "zarr.common.hh" +#include +#include + +#include // std::fill #include #include +#include #include +using json = nlohmann::json; + +namespace { +std::string +sample_type_to_dtype(ZarrDataType t) +{ + switch (t) { + case ZarrDataType_uint8: + return "uint8"; + case ZarrDataType_uint16: + return "uint16"; + case ZarrDataType_uint32: + return "uint32"; + case ZarrDataType_uint64: + return "uint64"; + case ZarrDataType_int8: + return "int8"; + case ZarrDataType_int16: + return "int16"; + case ZarrDataType_int32: + return "int32"; + case ZarrDataType_int64: + return "int64"; + case ZarrDataType_float32: + return "float32"; + case ZarrDataType_float64: + return "float64"; + default: + throw std::runtime_error("Invalid ZarrDataType: " + + std::to_string(static_cast(t))); + } +} + +std::string +shuffle_to_string(uint8_t shuffle) +{ + switch (shuffle) { + case 0: + return "noshuffle"; + case 1: + return "shuffle"; + case 2: + return "bitshuffle"; + default: + throw std::runtime_error("Invalid shuffle value: " + + std::to_string(shuffle)); + } +} +} // namespace + zarr::Array::Array(std::shared_ptr 
config, std::shared_ptr thread_pool, std::shared_ptr file_handle_pool, @@ -15,11 +70,26 @@ zarr::Array::Array(std::shared_ptr config, , bytes_to_flush_{ 0 } , frames_written_{ 0 } , append_chunk_index_{ 0 } + , current_layer_{ 0 } , is_closing_{ false } { const size_t n_chunks = config_->dimensions->number_of_chunks_in_memory(); EXPECT(n_chunks > 0, "Array has zero chunks in memory"); chunk_buffers_ = std::vector(n_chunks); + + const auto& dims = config_->dimensions; + const auto number_of_shards = dims->number_of_shards(); + const auto chunks_per_shard = dims->chunks_per_shard(); + + shard_file_offsets_.resize(number_of_shards, 0); + shard_tables_.resize(number_of_shards); + + for (auto& table : shard_tables_) { + table.resize(2 * chunks_per_shard); + std::ranges::fill(table, std::numeric_limits::max()); + } + + data_root_ = node_path_() + "/c/" + std::to_string(append_chunk_index_); } size_t @@ -80,6 +150,121 @@ zarr::Array::write_frame(LockedBuffer& data) return bytes_written; } +std::vector +zarr::Array::metadata_keys_() const +{ + return { "zarr.json" }; +} + +bool +zarr::Array::make_metadata_() +{ + metadata_strings_.clear(); + + std::vector array_shape, chunk_shape, shard_shape; + const auto& dims = config_->dimensions; + + size_t append_size = frames_written_; + for (auto i = dims->ndims() - 3; i > 0; --i) { + const auto& dim = dims->at(i); + const auto& array_size_px = dim.array_size_px; + CHECK(array_size_px); + append_size = (append_size + array_size_px - 1) / array_size_px; + } + array_shape.push_back(append_size); + + const auto& final_dim = dims->final_dim(); + chunk_shape.push_back(final_dim.chunk_size_px); + shard_shape.push_back(final_dim.shard_size_chunks * chunk_shape.back()); + for (auto i = 1; i < dims->ndims(); ++i) { + const auto& dim = dims->at(i); + array_shape.push_back(dim.array_size_px); + chunk_shape.push_back(dim.chunk_size_px); + shard_shape.push_back(dim.shard_size_chunks * chunk_shape.back()); + } + + json metadata; + 
metadata["shape"] = array_shape; + metadata["chunk_grid"] = json::object({ + { "name", "regular" }, + { + "configuration", + json::object({ { "chunk_shape", shard_shape } }), + }, + }); + metadata["chunk_key_encoding"] = json::object({ + { "name", "default" }, + { + "configuration", + json::object({ { "separator", "/" } }), + }, + }); + metadata["fill_value"] = 0; + metadata["attributes"] = json::object(); + metadata["zarr_format"] = 3; + metadata["node_type"] = "array"; + metadata["storage_transformers"] = json::array(); + metadata["data_type"] = sample_type_to_dtype(config_->dtype); + metadata["storage_transformers"] = json::array(); + + std::vector dimension_names(dims->ndims()); + for (auto i = 0; i < dimension_names.size(); ++i) { + dimension_names[i] = dims->at(i).name; + } + metadata["dimension_names"] = dimension_names; + + auto codecs = json::array(); + + auto sharding_indexed = json::object(); + sharding_indexed["name"] = "sharding_indexed"; + + auto configuration = json::object(); + configuration["chunk_shape"] = chunk_shape; + + auto codec = json::object(); + codec["configuration"] = json::object({ { "endian", "little" } }); + codec["name"] = "bytes"; + + auto index_codec = json::object(); + index_codec["configuration"] = json::object({ { "endian", "little" } }); + index_codec["name"] = "bytes"; + + auto crc32_codec = json::object({ { "name", "crc32c" } }); + configuration["index_codecs"] = json::array({ + index_codec, + crc32_codec, + }); + + configuration["index_location"] = "end"; + configuration["codecs"] = json::array({ codec }); + + if (config_->compression_params) { + const auto params = *config_->compression_params; + + auto compression_config = json::object(); + compression_config["blocksize"] = 0; + compression_config["clevel"] = params.clevel; + compression_config["cname"] = params.codec_id; + compression_config["shuffle"] = shuffle_to_string(params.shuffle); + compression_config["typesize"] = bytes_of_type(config_->dtype); + + auto 
compression_codec = json::object(); + compression_codec["configuration"] = compression_config; + compression_codec["name"] = "blosc"; + configuration["codecs"].push_back(compression_codec); + } + + sharding_indexed["configuration"] = configuration; + + codecs.push_back(sharding_indexed); + + metadata["codecs"] = codecs; + + metadata_strings_.emplace("zarr.json", metadata.dump(4)); + + return true; +} + bool zarr::Array::close_() { @@ -111,6 +296,95 @@ zarr::Array::close_() return retval; } +bool +zarr::Array::close_impl_() +{ + if (current_layer_ == 0) { + return true; + } + + // write the table + const auto& dims = config_->dimensions; + const auto n_shards = dims->number_of_shards(); + std::vector> futures; + + std::atomic all_successful = 1; + + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const std::string data_path = data_paths_[shard_idx]; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + auto* shard_table = shard_tables_.data() + shard_idx; + + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + auto job = [shard_idx, + data_path, + shard_table, + file_offset, + promise, + &all_successful, + this](std::string& err) { + bool success = true; + + try { + std::unique_ptr sink; + + if (data_sinks_.contains( + data_path)) { // sink already constructed + sink = std::move(data_sinks_[data_path]); + data_sinks_.erase(data_path); + } else { + sink = make_data_sink_(data_path); + } + + if (sink == nullptr) { + err = "Failed to create sink for " + data_path; + success = false; + } else { + const auto table_size = + shard_table->size() * sizeof(uint64_t); + std::vector table(table_size + sizeof(uint32_t)); + + // copy the table data + memcpy(table.data(), shard_table->data(), table_size); + const auto* table_ptr = table.data(); + + // compute crc32 checksum of the table + const uint32_t checksum = + crc32c::Crc32c(table_ptr, table_size); + memcpy( + table.data() + table_size, &checksum, 
sizeof(uint32_t)); + + if (!sink->write(*file_offset, table)) { + err = "Failed to write table and checksum to shard " + + std::to_string(shard_idx); + success = false; + } + } + } catch (const std::exception& exc) { + err = "Failed to flush data: " + std::string(exc.what()); + success = false; + } + + all_successful.fetch_and(success); + promise->set_value(); + + return success; + }; + + // one thread is reserved for processing the frame queue and runs the + // entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + if (std::string err; !job(err)) { + LOG_ERROR(err); + } + } + } + + return all_successful; +} + bool zarr::Array::is_s3_array_() const { @@ -122,7 +396,7 @@ zarr::Array::make_data_paths_() { if (data_paths_.empty()) { data_paths_ = construct_data_paths( - data_root_(), *config_->dimensions, parts_along_dimension_()); + data_root_, *config_->dimensions, shards_along_dimension); } } @@ -268,6 +542,285 @@ zarr::Array::write_frame_to_chunks_(LockedBuffer& data) return bytes_written; } +ByteVector +zarr::Array::consolidate_chunks_(uint32_t shard_index) +{ + const auto& dims = config_->dimensions; + CHECK(shard_index < dims->number_of_shards()); + + const auto chunks_per_shard = dims->chunks_per_shard(); + const auto chunks_in_mem = dims->number_of_chunks_in_memory(); + const auto n_layers = dims->chunk_layers_per_shard(); + + const auto chunks_per_layer = chunks_per_shard / n_layers; + const auto layer_offset = current_layer_ * chunks_per_layer; + const auto chunk_offset = current_layer_ * chunks_in_mem; + + auto& shard_table = shard_tables_[shard_index]; + const auto file_offset = shard_file_offsets_[shard_index]; + shard_table[2 * layer_offset] = file_offset; + + uint64_t last_chunk_offset = shard_table[2 * layer_offset]; + uint64_t last_chunk_size = shard_table[2 * layer_offset + 1]; + size_t shard_size = last_chunk_size; + + for (auto i = 1; i < chunks_per_layer; ++i) { + const auto offset_idx = 2 * 
(layer_offset + i); + const auto size_idx = offset_idx + 1; + if (shard_table[size_idx] == std::numeric_limits::max()) { + continue; + } + + shard_table[offset_idx] = last_chunk_offset + last_chunk_size; + last_chunk_offset = shard_table[offset_idx]; + last_chunk_size = shard_table[size_idx]; + shard_size += last_chunk_size; + } + + std::vector shard_layer(shard_size); + + const auto chunk_indices_this_layer = + dims->chunk_indices_for_shard_layer(shard_index, current_layer_); + + size_t offset = 0; + for (const auto& idx : chunk_indices_this_layer) { + // this clears the chunk data out of the LockedBuffer + const auto chunk = chunk_buffers_[idx - chunk_offset].take(); + std::copy(chunk.begin(), chunk.end(), shard_layer.begin() + offset); + + offset += chunk.size(); + } + + EXPECT(offset == shard_size, + "Consolidated shard size does not match expected: ", + offset, + " != ", + shard_size); + + return std::move(shard_layer); +} + +bool +zarr::Array::compress_and_flush_data_() +{ + // construct paths to shard sinks if they don't already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + // create parent directories if needed + const auto is_s3 = is_s3_array_(); + if (!is_s3) { + const auto parent_paths = get_parent_paths(data_paths_); + CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist + } + + const auto& dims = config_->dimensions; + + const auto n_shards = dims->number_of_shards(); + CHECK(data_paths_.size() == n_shards); + + const auto chunks_in_memory = chunk_buffers_.size(); + const auto n_layers = dims->chunk_layers_per_shard(); + CHECK(n_layers > 0); + + const auto chunk_group_offset = current_layer_ * chunks_in_memory; + + std::atomic all_successful = 1; + + auto write_table = is_closing_ || should_rollover_(); + + std::vector> futures; + + // queue jobs to compress all chunks + const auto bytes_of_raw_chunk = config_->dimensions->bytes_per_chunk(); + const auto bytes_per_px = bytes_of_type(config_->dtype); + + for (auto i = 
0; i < chunks_in_memory; ++i) { + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + const auto chunk_idx = i + chunk_group_offset; + const auto shard_idx = dims->shard_index_for_chunk(chunk_idx); + const auto internal_idx = dims->shard_internal_index(chunk_idx); + auto* shard_table = shard_tables_.data() + shard_idx; + + if (config_->compression_params) { + const auto compression_params = config_->compression_params.value(); + + auto job = [&chunk_buffer = chunk_buffers_[i], + bytes_per_px, + compression_params, + shard_table, + shard_idx, + chunk_idx, + internal_idx, + promise, + &all_successful](std::string& err) { + bool success = false; + + try { + if (!chunk_buffer.compress(compression_params, + bytes_per_px)) { + err = "Failed to compress chunk " + + std::to_string(chunk_idx) + " (internal index " + + std::to_string(internal_idx) + " of shard " + + std::to_string(shard_idx) + ")"; + } + + // update shard table with size + shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); + success = true; + } catch (const std::exception& exc) { + err = exc.what(); + } + + promise->set_value(); + + all_successful.fetch_and(static_cast(success)); + return success; + }; + + // one thread is reserved for processing the frame queue and runs + // the entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || + !thread_pool_->push_job(job)) { + std::string err; + if (!job(err)) { + LOG_ERROR(err); + } + } + } else { + // no compression, just update shard table with size + shard_table->at(2 * internal_idx + 1) = bytes_of_raw_chunk; + } + } + + // if we're not compressing, there aren't any futures to wait for + for (auto& future : futures) { + future.wait(); + } + futures.clear(); + + const auto bucket_name = config_->bucket_name; + auto connection_pool = s3_connection_pool_; + + // wait for the chunks in each shard to finish compressing, then defragment + // and write the shard + for (auto shard_idx = 0; shard_idx < n_shards; 
++shard_idx) { + const std::string data_path = data_paths_[shard_idx]; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + auto* shard_table = shard_tables_.data() + shard_idx; + + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + auto job = [shard_idx, + is_s3, + data_path, + shard_table, + file_offset, + write_table, + bucket_name, + connection_pool, + promise, + &all_successful, + this](std::string& err) { + bool success = true; + std::unique_ptr sink; + + try { + // consolidate chunks in shard + const auto shard_data = consolidate_chunks_(shard_idx); + + if (data_sinks_.contains(data_path)) { // S3 sink, constructed + sink = std::move(data_sinks_[data_path]); + data_sinks_.erase(data_path); + } else { + sink = make_data_sink_(data_path); + } + + if (sink == nullptr) { + err = "Failed to create sink for " + data_path; + success = false; + } else { + success = sink->write(*file_offset, shard_data); + if (!success) { + err = "Failed to write shard at path " + data_path; + } else { + *file_offset += shard_data.size(); + + if (write_table) { + const size_t table_size = + shard_table->size() * sizeof(uint64_t); + std::vector table( + table_size + sizeof(uint32_t), 0); + + memcpy( + table.data(), shard_table->data(), table_size); + + // compute crc32 checksum of the table + const uint32_t checksum = + crc32c::Crc32c(table.data(), table_size); + memcpy(table.data() + table_size, + &checksum, + sizeof(uint32_t)); + + if (!sink->write(*file_offset, table)) { + err = "Failed to write table and checksum to " + "shard " + + std::to_string(shard_idx); + success = false; + } + } + } + } + } catch (const std::exception& exc) { + err = "Failed to flush data: " + std::string(exc.what()); + success = false; + } + + if (sink != nullptr) { + data_sinks_.emplace(data_path, std::move(sink)); + } + + all_successful.fetch_and(success); + promise->set_value(); + + return success; + }; + + // one thread is reserved for processing the frame 
queue and runs the + // entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + std::string err; + if (!job(err)) { + LOG_ERROR(err); + } + } + } + + // wait for all threads to finish + for (auto& future : futures) { + future.wait(); + } + + // reset shard tables and file offsets + if (write_table) { + for (auto& table : shard_tables_) { + std::fill( + table.begin(), table.end(), std::numeric_limits::max()); + } + + std::fill(shard_file_offsets_.begin(), shard_file_offsets_.end(), 0); + current_layer_ = 0; + } else { + ++current_layer_; + } + + return static_cast(all_successful); +} + bool zarr::Array::should_flush_() const { @@ -281,6 +834,21 @@ zarr::Array::should_flush_() const return frames_written_ % frames_before_flush == 0; } +bool +zarr::Array::should_rollover_() const +{ + const auto& dims = config_->dimensions; + const auto& append_dim = dims->final_dim(); + size_t frames_before_flush = + append_dim.chunk_size_px * append_dim.shard_size_chunks; + for (auto i = 1; i < dims->ndims() - 2; ++i) { + frames_before_flush *= dims->at(i).array_size_px; + } + + CHECK(frames_before_flush > 0); + return frames_written_ % frames_before_flush == 0; +} + void zarr::Array::rollover_() { @@ -288,4 +856,17 @@ zarr::Array::rollover_() close_sinks_(); ++append_chunk_index_; + data_root_ = node_path_() + "/c/" + std::to_string(append_chunk_index_); +} + +void +zarr::Array::close_sinks_() +{ + data_paths_.clear(); + + for (auto& [path, sink] : data_sinks_) { + EXPECT( + finalize_sink(std::move(sink)), "Failed to finalize sink at ", path); + } + data_sinks_.clear(); } diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 80e2cdc0..4010dae0 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -29,33 +29,40 @@ class Array : public ArrayBase /// Filesystem std::vector data_paths_; + std::unordered_map> data_sinks_; /// Bookkeeping uint64_t bytes_to_flush_; uint32_t frames_written_; uint32_t 
append_chunk_index_; + std::string data_root_; bool is_closing_; + /// Sharding + uint32_t current_layer_; + std::vector shard_file_offsets_; + std::vector> shard_tables_; + + std::vector metadata_keys_() const override; + bool make_metadata_() override; [[nodiscard]] bool close_() override; - [[nodiscard]] virtual bool close_impl_() = 0; + [[nodiscard]] bool close_impl_(); bool is_s3_array_() const; - virtual std::string data_root_() const = 0; - virtual const DimensionPartsFun parts_along_dimension_() const = 0; void make_data_paths_(); [[nodiscard]] std::unique_ptr make_data_sink_(std::string_view path); void fill_buffers_(); bool should_flush_() const; - virtual bool should_rollover_() const = 0; + bool should_rollover_() const; size_t write_frame_to_chunks_(LockedBuffer& data); - [[nodiscard]] virtual bool compress_and_flush_data_() = 0; + [[nodiscard]] ByteVector consolidate_chunks_(uint32_t shard_index); + [[nodiscard]] bool compress_and_flush_data_(); void rollover_(); - - virtual void close_sinks_() = 0; + void close_sinks_(); friend class MultiscaleArray; }; diff --git a/src/streaming/v3.array.cpp b/src/streaming/v3.array.cpp deleted file mode 100644 index b6e85d8a..00000000 --- a/src/streaming/v3.array.cpp +++ /dev/null @@ -1,605 +0,0 @@ -#include "v3.array.hh" - -#include "macros.hh" -#include "sink.hh" -#include "zarr.common.hh" - -#include -#include - -#include // std::fill -#include -#include - -using json = nlohmann::json; - -namespace { -std::string -sample_type_to_dtype(ZarrDataType t) -{ - switch (t) { - case ZarrDataType_uint8: - return "uint8"; - case ZarrDataType_uint16: - return "uint16"; - case ZarrDataType_uint32: - return "uint32"; - case ZarrDataType_uint64: - return "uint64"; - case ZarrDataType_int8: - return "int8"; - case ZarrDataType_int16: - return "int16"; - case ZarrDataType_int32: - return "int32"; - case ZarrDataType_int64: - return "int64"; - case ZarrDataType_float32: - return "float32"; - case ZarrDataType_float64: - return 
"float64"; - default: - throw std::runtime_error("Invalid ZarrDataType: " + - std::to_string(static_cast(t))); - } -} - -std::string -shuffle_to_string(uint8_t shuffle) -{ - switch (shuffle) { - case 0: - return "noshuffle"; - case 1: - return "shuffle"; - case 2: - return "bitshuffle"; - default: - throw std::runtime_error("Invalid shuffle value: " + - std::to_string(shuffle)); - } -} -} // namespace - -zarr::V3Array::V3Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : Array(config, thread_pool, file_handle_pool, s3_connection_pool) - , current_layer_{ 0 } -{ - const auto& dims = config_->dimensions; - const auto number_of_shards = dims->number_of_shards(); - const auto chunks_per_shard = dims->chunks_per_shard(); - - shard_file_offsets_.resize(number_of_shards, 0); - shard_tables_.resize(number_of_shards); - - for (auto& table : shard_tables_) { - table.resize(2 * chunks_per_shard); - std::fill( - table.begin(), table.end(), std::numeric_limits::max()); - } -} - -std::vector -zarr::V3Array::metadata_keys_() const -{ - return { "zarr.json" }; -} - -bool -zarr::V3Array::make_metadata_() -{ - metadata_strings_.clear(); - - std::vector array_shape, chunk_shape, shard_shape; - const auto& dims = config_->dimensions; - - size_t append_size = frames_written_; - for (auto i = dims->ndims() - 3; i > 0; --i) { - const auto& dim = dims->at(i); - const auto& array_size_px = dim.array_size_px; - CHECK(array_size_px); - append_size = (append_size + array_size_px - 1) / array_size_px; - } - array_shape.push_back(append_size); - - const auto& final_dim = dims->final_dim(); - chunk_shape.push_back(final_dim.chunk_size_px); - shard_shape.push_back(final_dim.shard_size_chunks * chunk_shape.back()); - for (auto i = 1; i < dims->ndims(); ++i) { - const auto& dim = dims->at(i); - array_shape.push_back(dim.array_size_px); - chunk_shape.push_back(dim.chunk_size_px); - 
shard_shape.push_back(dim.shard_size_chunks * chunk_shape.back()); - } - - json metadata; - metadata["shape"] = array_shape; - metadata["chunk_grid"] = json::object({ - { "name", "regular" }, - { - "configuration", - json::object({ { "chunk_shape", shard_shape } }), - }, - }); - metadata["chunk_key_encoding"] = json::object({ - { "name", "default" }, - { - "configuration", - json::object({ { "separator", "/" } }), - }, - }); - metadata["fill_value"] = 0; - metadata["attributes"] = json::object(); - metadata["zarr_format"] = 3; - metadata["node_type"] = "array"; - metadata["storage_transformers"] = json::array(); - metadata["data_type"] = sample_type_to_dtype(config_->dtype); - metadata["storage_transformers"] = json::array(); - - std::vector dimension_names(dims->ndims()); - for (auto i = 0; i < dimension_names.size(); ++i) { - dimension_names[i] = dims->at(i).name; - } - metadata["dimension_names"] = dimension_names; - - auto codecs = json::array(); - - auto sharding_indexed = json::object(); - sharding_indexed["name"] = "sharding_indexed"; - - auto configuration = json::object(); - configuration["chunk_shape"] = chunk_shape; - - auto codec = json::object(); - codec["configuration"] = json::object({ { "endian", "little" } }); - codec["name"] = "bytes"; - - auto index_codec = json::object(); - index_codec["configuration"] = json::object({ { "endian", "little" } }); - index_codec["name"] = "bytes"; - - auto crc32_codec = json::object({ { "name", "crc32c" } }); - configuration["index_codecs"] = json::array({ - index_codec, - crc32_codec, - }); - - configuration["index_location"] = "end"; - configuration["codecs"] = json::array({ codec }); - - if (config_->compression_params) { - const auto params = *config_->compression_params; - - auto compression_config = json::object(); - compression_config["blocksize"] = 0; - compression_config["clevel"] = params.clevel; - compression_config["cname"] = params.codec_id; - compression_config["shuffle"] = 
shuffle_to_string(params.shuffle); - compression_config["typesize"] = bytes_of_type(config_->dtype); - - auto compression_codec = json::object(); - compression_codec["configuration"] = compression_config; - compression_codec["name"] = "blosc"; - configuration["codecs"].push_back(compression_codec); - } - - sharding_indexed["configuration"] = configuration; - - codecs.push_back(sharding_indexed); - - metadata["codecs"] = codecs; - - metadata_strings_.emplace("zarr.json", metadata.dump(4)); - - return true; -} - -ByteVector -zarr::V3Array::consolidate_chunks_(uint32_t shard_index) -{ - const auto& dims = config_->dimensions; - CHECK(shard_index < dims->number_of_shards()); - - const auto chunks_per_shard = dims->chunks_per_shard(); - const auto chunks_in_mem = dims->number_of_chunks_in_memory(); - const auto n_layers = dims->chunk_layers_per_shard(); - - const auto chunks_per_layer = chunks_per_shard / n_layers; - const auto layer_offset = current_layer_ * chunks_per_layer; - const auto chunk_offset = current_layer_ * chunks_in_mem; - - auto& shard_table = shard_tables_[shard_index]; - const auto file_offset = shard_file_offsets_[shard_index]; - shard_table[2 * layer_offset] = file_offset; - - uint64_t last_chunk_offset = shard_table[2 * layer_offset]; - uint64_t last_chunk_size = shard_table[2 * layer_offset + 1]; - size_t shard_size = last_chunk_size; - - for (auto i = 1; i < chunks_per_layer; ++i) { - const auto offset_idx = 2 * (layer_offset + i); - const auto size_idx = offset_idx + 1; - if (shard_table[size_idx] == std::numeric_limits::max()) { - continue; - } - - shard_table[offset_idx] = last_chunk_offset + last_chunk_size; - last_chunk_offset = shard_table[offset_idx]; - last_chunk_size = shard_table[size_idx]; - shard_size += last_chunk_size; - } - - std::vector shard_layer(shard_size); - - const auto chunk_indices_this_layer = - dims->chunk_indices_for_shard_layer(shard_index, current_layer_); - - size_t offset = 0; - for (const auto& idx : 
chunk_indices_this_layer) { - // this clears the chunk data out of the LockedBuffer - const auto chunk = chunk_buffers_[idx - chunk_offset].take(); - std::copy(chunk.begin(), chunk.end(), shard_layer.begin() + offset); - - offset += chunk.size(); - } - - EXPECT(offset == shard_size, - "Consolidated shard size does not match expected: ", - offset, - " != ", - shard_size); - - return std::move(shard_layer); -} - -bool -zarr::V3Array::close_impl_() -{ - if (current_layer_ == 0) { - return true; - } - - // write the table - const auto& dims = config_->dimensions; - const auto n_shards = dims->number_of_shards(); - std::vector> futures; - - std::atomic all_successful = 1; - - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const std::string data_path = data_paths_[shard_idx]; - auto* file_offset = shard_file_offsets_.data() + shard_idx; - auto* shard_table = shard_tables_.data() + shard_idx; - - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [shard_idx, - data_path, - shard_table, - file_offset, - promise, - &all_successful, - this](std::string& err) { - bool success = true; - - try { - std::unique_ptr sink; - - if (data_sinks_.contains( - data_path)) { // sink already constructed - sink = std::move(data_sinks_[data_path]); - data_sinks_.erase(data_path); - } else { - sink = make_data_sink_(data_path); - } - - if (sink == nullptr) { - err = "Failed to create sink for " + data_path; - success = false; - } else { - const auto table_size = - shard_table->size() * sizeof(uint64_t); - std::vector table(table_size + sizeof(uint32_t)); - - // copy the table data - memcpy(table.data(), shard_table->data(), table_size); - const auto* table_ptr = table.data(); - - // compute crc32 checksum of the table - const uint32_t checksum = - crc32c::Crc32c(table_ptr, table_size); - memcpy( - table.data() + table_size, &checksum, sizeof(uint32_t)); - - if (!sink->write(*file_offset, table)) { - err = "Failed to write table 
and checksum to shard " + - std::to_string(shard_idx); - success = false; - } - } - } catch (const std::exception& exc) { - err = "Failed to flush data: " + std::string(exc.what()); - success = false; - } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - if (std::string err; !job(err)) { - LOG_ERROR(err); - } - } - } - - return all_successful; -} - -std::string -zarr::V3Array::data_root_() const -{ - return node_path_() + "/c/" + std::to_string(append_chunk_index_); -} - -const DimensionPartsFun -zarr::V3Array::parts_along_dimension_() const -{ - return shards_along_dimension; -} - -bool -zarr::V3Array::compress_and_flush_data_() -{ - // construct paths to shard sinks if they don't already exist - if (data_paths_.empty()) { - make_data_paths_(); - } - - // create parent directories if needed - const auto is_s3 = is_s3_array_(); - if (!is_s3) { - const auto parent_paths = get_parent_paths(data_paths_); - CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist - } - - const auto& dims = config_->dimensions; - - const auto n_shards = dims->number_of_shards(); - CHECK(data_paths_.size() == n_shards); - - const auto chunks_in_memory = chunk_buffers_.size(); - const auto n_layers = dims->chunk_layers_per_shard(); - CHECK(n_layers > 0); - - const auto chunk_group_offset = current_layer_ * chunks_in_memory; - - std::atomic all_successful = 1; - - auto write_table = is_closing_ || should_rollover_(); - - std::vector> futures; - - // queue jobs to compress all chunks - const auto bytes_of_raw_chunk = config_->dimensions->bytes_per_chunk(); - const auto bytes_per_px = bytes_of_type(config_->dtype); - - for (auto i = 0; i < chunks_in_memory; ++i) { - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - const auto 
chunk_idx = i + chunk_group_offset; - const auto shard_idx = dims->shard_index_for_chunk(chunk_idx); - const auto internal_idx = dims->shard_internal_index(chunk_idx); - auto* shard_table = shard_tables_.data() + shard_idx; - - if (config_->compression_params) { - const auto compression_params = config_->compression_params.value(); - - auto job = [&chunk_buffer = chunk_buffers_[i], - bytes_per_px, - compression_params, - shard_table, - shard_idx, - chunk_idx, - internal_idx, - promise, - &all_successful](std::string& err) { - bool success = false; - - try { - if (!chunk_buffer.compress(compression_params, - bytes_per_px)) { - err = "Failed to compress chunk " + - std::to_string(chunk_idx) + " (internal index " + - std::to_string(internal_idx) + " of shard " + - std::to_string(shard_idx) + ")"; - } - - // update shard table with size - shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); - success = true; - } catch (const std::exception& exc) { - err = exc.what(); - } - - promise->set_value(); - - all_successful.fetch_and(static_cast(success)); - return success; - }; - - // one thread is reserved for processing the frame queue and runs - // the entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || - !thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - } else { - // no compression, just update shard table with size - shard_table->at(2 * internal_idx + 1) = bytes_of_raw_chunk; - } - } - - // if we're not compressing, there aren't any futures to wait for - for (auto& future : futures) { - future.wait(); - } - futures.clear(); - - const auto bucket_name = config_->bucket_name; - auto connection_pool = s3_connection_pool_; - - // wait for the chunks in each shard to finish compressing, then defragment - // and write the shard - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const std::string data_path = data_paths_[shard_idx]; - auto* file_offset = shard_file_offsets_.data() + 
shard_idx; - auto* shard_table = shard_tables_.data() + shard_idx; - - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [shard_idx, - is_s3, - data_path, - shard_table, - file_offset, - write_table, - bucket_name, - connection_pool, - promise, - &all_successful, - this](std::string& err) { - bool success = true; - std::unique_ptr sink; - - try { - // consolidate chunks in shard - const auto shard_data = consolidate_chunks_(shard_idx); - - if (data_sinks_.contains(data_path)) { // S3 sink, constructed - sink = std::move(data_sinks_[data_path]); - data_sinks_.erase(data_path); - } else { - sink = make_data_sink_(data_path); - } - - if (sink == nullptr) { - err = "Failed to create sink for " + data_path; - success = false; - } else { - success = sink->write(*file_offset, shard_data); - if (!success) { - err = "Failed to write shard at path " + data_path; - } else { - *file_offset += shard_data.size(); - - if (write_table) { - const size_t table_size = - shard_table->size() * sizeof(uint64_t); - std::vector table( - table_size + sizeof(uint32_t), 0); - - memcpy( - table.data(), shard_table->data(), table_size); - - // compute crc32 checksum of the table - const uint32_t checksum = - crc32c::Crc32c(table.data(), table_size); - memcpy(table.data() + table_size, - &checksum, - sizeof(uint32_t)); - - if (!sink->write(*file_offset, table)) { - err = "Failed to write table and checksum to " - "shard " + - std::to_string(shard_idx); - success = false; - } - } - } - } - } catch (const std::exception& exc) { - err = "Failed to flush data: " + std::string(exc.what()); - success = false; - } - - if (sink != nullptr) { - data_sinks_.emplace(data_path, std::move(sink)); - } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || 
!thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - } - - // wait for all threads to finish - for (auto& future : futures) { - future.wait(); - } - - // reset shard tables and file offsets - if (write_table) { - for (auto& table : shard_tables_) { - std::fill( - table.begin(), table.end(), std::numeric_limits::max()); - } - - std::fill(shard_file_offsets_.begin(), shard_file_offsets_.end(), 0); - current_layer_ = 0; - } else { - ++current_layer_; - } - - return static_cast(all_successful); -} - -void -zarr::V3Array::close_sinks_() -{ - data_paths_.clear(); - - for (auto& [path, sink] : data_sinks_) { - EXPECT( - finalize_sink(std::move(sink)), "Failed to finalize sink at ", path); - } - data_sinks_.clear(); -} - -bool -zarr::V3Array::should_rollover_() const -{ - const auto& dims = config_->dimensions; - const auto& append_dim = dims->final_dim(); - size_t frames_before_flush = - append_dim.chunk_size_px * append_dim.shard_size_chunks; - for (auto i = 1; i < dims->ndims() - 2; ++i) { - frames_before_flush *= dims->at(i).array_size_px; - } - - CHECK(frames_before_flush > 0); - return frames_written_ % frames_before_flush == 0; -} diff --git a/src/streaming/v3.array.hh b/src/streaming/v3.array.hh deleted file mode 100644 index bd770128..00000000 --- a/src/streaming/v3.array.hh +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include "array.hh" - -namespace zarr { -class V3Array final : public Array -{ - public: - V3Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); - - private: - std::vector shard_file_offsets_; - std::vector> shard_tables_; - uint32_t current_layer_; - - std::unordered_map> data_sinks_; - - std::vector metadata_keys_() const override; - bool make_metadata_() override; - - bool close_impl_() override; - std::string data_root_() const override; - const DimensionPartsFun parts_along_dimension_() const override; - bool 
compress_and_flush_data_() override; - void close_sinks_() override; - bool should_rollover_() const override; - - ByteVector consolidate_chunks_(uint32_t shard_index); -}; -} // namespace zarr diff --git a/src/streaming/v3.multiscale.array.cpp b/src/streaming/v3.multiscale.array.cpp index 07dce89d..d1ba4760 100644 --- a/src/streaming/v3.multiscale.array.cpp +++ b/src/streaming/v3.multiscale.array.cpp @@ -53,12 +53,12 @@ zarr::V3MultiscaleArray::create_arrays_() arrays_.resize(configs.size()); for (const auto& [lod, config] : configs) { - arrays_[lod] = std::make_unique( + arrays_[lod] = std::make_unique( config, thread_pool_, file_handle_pool_, s3_connection_pool_); } } else { const auto config = make_base_array_config_(); - arrays_.push_back(std::make_unique( + arrays_.push_back(std::make_unique( config, thread_pool_, file_handle_pool_, s3_connection_pool_)); } diff --git a/src/streaming/v3.multiscale.array.hh b/src/streaming/v3.multiscale.array.hh index 3e6c2112..c7977df1 100644 --- a/src/streaming/v3.multiscale.array.hh +++ b/src/streaming/v3.multiscale.array.hh @@ -1,7 +1,7 @@ #pragma once #include "multiscale.array.hh" -#include "v3.array.hh" +#include "array.hh" namespace zarr { class V3MultiscaleArray final : public MultiscaleArray diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index 47978ed5..f60cc442 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -1,4 +1,4 @@ -#include "v3.array.hh" +#include "array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" @@ -102,7 +102,7 @@ main() const ZarrDataType dtype = ZarrDataType_uint16; const unsigned int nbytes_px = zarr::bytes_of_type(dtype); - try { + // try { auto thread_pool = std::make_shared( std::thread::hardware_concurrency(), [](const std::string& err) { LOG_ERROR("Error: ", err); }); @@ -142,7 +142,7 @@ main() level_of_detail); { - auto writer = std::make_unique( + auto writer = std::make_unique( config, 
thread_pool, std::make_shared(), @@ -211,9 +211,9 @@ main() CHECK(!fs::is_directory(data_root / "c" / std::to_string(shards_in_t))); retval = 0; - } catch (const std::exception& exc) { - LOG_ERROR("Exception: ", exc.what()); - } + // } catch (const std::exception& exc) { + // LOG_ERROR("Exception: ", exc.what()); + // } // cleanup if (fs::exists(base_dir)) { diff --git a/tests/unit-tests/array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp index 89962510..f6d1d8f4 100644 --- a/tests/unit-tests/array-write-ragged-append-dim.cpp +++ b/tests/unit-tests/array-write-ragged-append-dim.cpp @@ -1,4 +1,4 @@ -#include "v3.array.hh" +#include "array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" @@ -113,7 +113,7 @@ main() 4); { - auto writer = std::make_unique( + auto writer = std::make_unique( config, thread_pool, std::make_shared(), diff --git a/tests/unit-tests/array-write-ragged-internal-dim.cpp b/tests/unit-tests/array-write-ragged-internal-dim.cpp index 9dc70268..9c9d52f7 100644 --- a/tests/unit-tests/array-write-ragged-internal-dim.cpp +++ b/tests/unit-tests/array-write-ragged-internal-dim.cpp @@ -1,4 +1,4 @@ -#include "v3.array.hh" +#include "array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" @@ -130,7 +130,7 @@ main() 5); { - auto writer = std::make_unique( + auto writer = std::make_unique( config, thread_pool, std::make_shared(), From d6159f39c0d8d70012c0fdd1755447c0ba40e6c0 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 9 Oct 2025 16:09:42 -0400 Subject: [PATCH 05/38] V3MultiscaleArray -> MultiscaleArray --- src/streaming/CMakeLists.txt | 2 - src/streaming/array.base.cpp | 7 ++- src/streaming/multiscale.array.cpp | 66 +++++++++++++++++++++++ src/streaming/multiscale.array.hh | 9 ++-- src/streaming/v3.multiscale.array.cpp | 77 --------------------------- src/streaming/v3.multiscale.array.hh | 22 -------- 6 files changed, 74 insertions(+), 109 deletions(-) delete mode 100644 
src/streaming/v3.multiscale.array.cpp delete mode 100644 src/streaming/v3.multiscale.array.hh diff --git a/src/streaming/CMakeLists.txt b/src/streaming/CMakeLists.txt index 41be57b0..fe1713a1 100644 --- a/src/streaming/CMakeLists.txt +++ b/src/streaming/CMakeLists.txt @@ -42,8 +42,6 @@ add_library(${tgt} array.cpp multiscale.array.hh multiscale.array.cpp - v3.multiscale.array.hh - v3.multiscale.array.cpp plate.hh plate.cpp $ diff --git a/src/streaming/array.base.cpp b/src/streaming/array.base.cpp index 53fd84d7..391f7068 100644 --- a/src/streaming/array.base.cpp +++ b/src/streaming/array.base.cpp @@ -1,10 +1,9 @@ #include +#include "array.hh" #include "array.base.hh" -#include "multiscale.array.hh" #include "macros.hh" -#include "array.hh" -#include "v3.multiscale.array.hh" +#include "multiscale.array.hh" zarr::ArrayBase::ArrayBase(std::shared_ptr config, std::shared_ptr thread_pool, @@ -110,7 +109,7 @@ zarr::make_array(std::shared_ptr config, std::unique_ptr array; if (multiscale) { - array = std::make_unique( + array = std::make_unique( config, thread_pool, file_handle_pool, s3_connection_pool); } else { array = std::make_unique( diff --git a/src/streaming/multiscale.array.cpp b/src/streaming/multiscale.array.cpp index 7901a37e..bf9f8a77 100644 --- a/src/streaming/multiscale.array.cpp +++ b/src/streaming/multiscale.array.cpp @@ -33,6 +33,12 @@ zarr::MultiscaleArray::MultiscaleArray( : bytes_of_frame(*config_->dimensions, config_->dtype); EXPECT(create_downsampler_(), "Failed to create downsampler"); + + // dimensions may be null in the case of intermediate groups, e.g., the + // A in A/1 + if (config_->dimensions) { + CHECK(create_arrays_()); + } } size_t @@ -70,6 +76,33 @@ zarr::MultiscaleArray::write_frame(LockedBuffer& data) return n_bytes; } +std::vector +zarr::MultiscaleArray::metadata_keys_() const +{ + return { "zarr.json" }; +} + +bool +zarr::MultiscaleArray::make_metadata_() +{ + metadata_sinks_.clear(); + + nlohmann::json metadata = { + { "zarr_format", 
3 }, + { "consolidated_metadata", nullptr }, + { "node_type", "group" }, + { "attributes", nlohmann::json::object() }, + }; + + if (!arrays_.empty()) { + metadata["attributes"]["ome"] = get_ome_metadata_(); + } + + metadata_strings_.emplace("zarr.json", metadata.dump(4)); + + return true; +} + bool zarr::MultiscaleArray::close_() { @@ -97,6 +130,39 @@ zarr::MultiscaleArray::close_() return true; } +bool +zarr::MultiscaleArray::create_arrays_() +{ + arrays_.clear(); + + if (downsampler_) { + const auto& configs = downsampler_->writer_configurations(); + arrays_.resize(configs.size()); + + for (const auto& [lod, config] : configs) { + arrays_[lod] = std::make_unique( + config, thread_pool_, file_handle_pool_, s3_connection_pool_); + } + } else { + const auto config = make_base_array_config_(); + arrays_.push_back(std::make_unique( + config, thread_pool_, file_handle_pool_, s3_connection_pool_)); + } + + return true; +} + +nlohmann::json +zarr::MultiscaleArray::get_ome_metadata_() const +{ + nlohmann::json ome; + ome["version"] = "0.5"; + ome["name"] = "/"; + ome["multiscales"] = make_multiscales_metadata_(); + + return ome; +} + bool zarr::MultiscaleArray::create_downsampler_() { diff --git a/src/streaming/multiscale.array.hh b/src/streaming/multiscale.array.hh index 9098c6d9..96a6d4b1 100644 --- a/src/streaming/multiscale.array.hh +++ b/src/streaming/multiscale.array.hh @@ -31,22 +31,23 @@ class MultiscaleArray : public ArrayBase [[nodiscard]] size_t write_frame(LockedBuffer& data) override; protected: - std::unique_ptr downsampler_; - + std::unique_ptr downsampler_; std::vector> arrays_; size_t bytes_per_frame_; + std::vector metadata_keys_() const override; + bool make_metadata_() override; bool close_() override; /** @brief Create array writers. */ - [[nodiscard]] virtual bool create_arrays_() = 0; + [[nodiscard]] bool create_arrays_(); /** * @brief Construct OME metadata for this group. * @return JSON structure with OME metadata for this group. 
*/ - virtual nlohmann::json get_ome_metadata_() const = 0; + nlohmann::json get_ome_metadata_() const; /** * @brief Create a downsampler for multiscale acquisitions. diff --git a/src/streaming/v3.multiscale.array.cpp b/src/streaming/v3.multiscale.array.cpp deleted file mode 100644 index d1ba4760..00000000 --- a/src/streaming/v3.multiscale.array.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include "macros.hh" -#include "v3.multiscale.array.hh" -#include "zarr.common.hh" - -zarr::V3MultiscaleArray::V3MultiscaleArray( - std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : MultiscaleArray(config, thread_pool, file_handle_pool, s3_connection_pool) -{ - // dimensions may be null in the case of intermediate groups, e.g., the - // A in A/1 - if (config_->dimensions) { - CHECK(create_arrays_()); - } -} - -std::vector -zarr::V3MultiscaleArray::metadata_keys_() const -{ - return { "zarr.json" }; -} - -bool -zarr::V3MultiscaleArray::make_metadata_() -{ - metadata_sinks_.clear(); - - nlohmann::json metadata = { - { "zarr_format", 3 }, - { "consolidated_metadata", nullptr }, - { "node_type", "group" }, - { "attributes", nlohmann::json::object() }, - }; - - if (!arrays_.empty()) { - metadata["attributes"]["ome"] = get_ome_metadata_(); - } - - metadata_strings_.emplace("zarr.json", metadata.dump(4)); - - return true; -} - -bool -zarr::V3MultiscaleArray::create_arrays_() -{ - arrays_.clear(); - - if (downsampler_) { - const auto& configs = downsampler_->writer_configurations(); - arrays_.resize(configs.size()); - - for (const auto& [lod, config] : configs) { - arrays_[lod] = std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_); - } - } else { - const auto config = make_base_array_config_(); - arrays_.push_back(std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_)); - } - - return true; -} - -nlohmann::json -zarr::V3MultiscaleArray::get_ome_metadata_() 
const -{ - nlohmann::json ome; - ome["version"] = "0.5"; - ome["name"] = "/"; - ome["multiscales"] = make_multiscales_metadata_(); - - return ome; -} \ No newline at end of file diff --git a/src/streaming/v3.multiscale.array.hh b/src/streaming/v3.multiscale.array.hh deleted file mode 100644 index c7977df1..00000000 --- a/src/streaming/v3.multiscale.array.hh +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "multiscale.array.hh" -#include "array.hh" - -namespace zarr { -class V3MultiscaleArray final : public MultiscaleArray -{ - public: - V3MultiscaleArray(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); - - private: - std::vector metadata_keys_() const override; - bool make_metadata_() override; - - bool create_arrays_() override; - nlohmann::json get_ome_metadata_() const override; -}; -} // namespace zarr \ No newline at end of file From 9481f08051eeae8a3592fbbfa04e8ab2faf64b50 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 9 Oct 2025 16:12:40 -0400 Subject: [PATCH 06/38] Update changelog --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 343ab5f4..8b2320a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- File handles are now managed by a pool to centrally limit the number of open files (#161) +- File handles are now managed by a pool to centrally limit the number of open files (#161) + +### Removed + +- Support for Zarr V2 has been removed (#165) ## [0.6.0] - [2025-09-24](https://github.com/acquire-project/acquire-zarr/compare/v0.5.2...v0.6.0) From ae6d911b5cdf8ce28eea9f9487c05ae9a919084f Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Sun, 12 Oct 2025 07:37:47 -0400 Subject: [PATCH 07/38] wip --- src/streaming/file.handle.hh | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/streaming/file.handle.hh b/src/streaming/file.handle.hh index ea2e94ee..c1ad87df 100644 --- a/src/streaming/file.handle.hh +++ b/src/streaming/file.handle.hh @@ -57,6 +57,9 @@ class FileHandlePool std::unique_ptr get_handle(const std::string& filename, void* flags); + std::shared_ptr get_handle_shared(const std::string& filename, + void* flags); + /** * @brief Return a file handle to the pool. * @details This function should be called when a file handle is no longer From 4d462a263be9da1d753a5226221b7ae707f5ad5d Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 15:51:26 +0200 Subject: [PATCH 08/38] Reinstate ZarrVersion, but remove V2 --- include/acquire.zarr.h | 2 ++ include/zarr.types.h | 6 ++++++ python/acquire-zarr-py.cpp | 17 +++++++++++++++++ python/acquire_zarr/__init__.pyi | 19 +++++++++++++++++++ python/tests/test_stream.py | 1 + 5 files changed, 45 insertions(+) diff --git a/include/acquire.zarr.h b/include/acquire.zarr.h index 97d869e6..0f790464 100644 --- a/include/acquire.zarr.h +++ b/include/acquire.zarr.h @@ -23,6 +23,8 @@ extern "C" const char* store_path; /**< Path to the store. Filesystem path or S3 key prefix. */ ZarrS3Settings* s3_settings; /**< Optional S3 settings for the store. */ + ZarrVersion version; /**< The version of the Zarr format to use. + Reserved for Zarr v4. */ unsigned int max_threads; /**< The maximum number of threads to use in the stream. Set to 0 to use the supported number of concurrent threads. 
*/ diff --git a/include/zarr.types.h b/include/zarr.types.h index 6512e551..e08871da 100644 --- a/include/zarr.types.h +++ b/include/zarr.types.h @@ -26,6 +26,12 @@ extern "C" ZarrStatusCodeCount, } ZarrStatusCode; + typedef enum + { + ZarrVersion_3 = 3, + ZarrVersionCount + } ZarrVersion; + typedef enum { ZarrLogLevel_Debug = 0, diff --git a/python/acquire-zarr-py.cpp b/python/acquire-zarr-py.cpp index e926ded2..7c2473cc 100644 --- a/python/acquire-zarr-py.cpp +++ b/python/acquire-zarr-py.cpp @@ -1227,6 +1227,8 @@ PYBIND11_MODULE(acquire_zarr, m) "VectorDimension"); py::bind_vector>(m, "VectorArraySettings"); + py::enum_(m, "ZarrVersion").value("V3", ZarrVersion_3); + py::enum_(m, "DataType") .value(data_type_to_str(ZarrDataType_uint8), ZarrDataType_uint8) .value(data_type_to_str(ZarrDataType_uint16), ZarrDataType_uint16) @@ -1940,6 +1942,7 @@ PYBIND11_MODULE(acquire_zarr, m) py::class_(m, "StreamSettings", py::dynamic_attr()) .def(py::init([](std::optional store_path, std::optional s3, + std::optional version, std::optional max_threads, std::optional overwrite, std::optional arrays, @@ -1981,6 +1984,7 @@ PYBIND11_MODULE(acquire_zarr, m) py::kw_only(), py::arg("store_path") = std::nullopt, py::arg("s3") = std::nullopt, + py::arg("version") = std::nullopt, py::arg("max_threads") = std::nullopt, py::arg("overwrite") = std::nullopt, py::arg("arrays") = std::nullopt, @@ -2020,6 +2024,19 @@ PYBIND11_MODULE(acquire_zarr, m) self.set_s3(obj.cast()); } }) + .def_property( + "version", + [](const PyZarrStreamSettings& self) { return ZarrVersion_3; }, + [](PyZarrStreamSettings&, const py::object& version) { + if (!version.is_none()) { + if (const auto ver = version.cast(); + ver != ZarrVersion_3) { + PyErr_SetString(PyExc_ValueError, + "Only ZarrVersion.V3 is supported."); + throw py::error_already_set(); + } + } + }) .def_property("max_threads", &PyZarrStreamSettings::max_threads, &PyZarrStreamSettings::set_max_threads) diff --git a/python/acquire_zarr/__init__.pyi 
b/python/acquire_zarr/__init__.pyi index 2ca6967f..37ace009 100644 --- a/python/acquire_zarr/__init__.pyi +++ b/python/acquire_zarr/__init__.pyi @@ -30,6 +30,7 @@ __all__ = [ "StreamSettings", "Well", "ZarrStream", + "ZarrVersion", "get_log_level", "set_log_level", ] @@ -430,6 +431,24 @@ class ZarrStream: def get_current_memory_usage(self) -> int: """Get the current memory usage of the stream in bytes.""" +class ZarrVersion: + """ + Zarr format version. + + Attributes: + V3: Zarr format version 3. + """ + + V3: ClassVar[ZarrVersion] # value = + __members__: ClassVar[ + dict[str, ZarrVersion] + ] # value = {'V2': , 'V3': } + + def __eq__(self, other: Any) -> bool: ... + def __getstate__(self) -> int: ... + def __hash__(self) -> int: ... + def __index__(self) -> int: ... + def get_log_level() -> LogLevel: """Get the current log level for the Zarr API""" diff --git a/python/tests/test_stream.py b/python/tests/test_stream.py index 9eb2bf2f..c8d7e108 100644 --- a/python/tests/test_stream.py +++ b/python/tests/test_stream.py @@ -29,6 +29,7 @@ S3Settings, Dimension, DimensionType, + ZarrVersion, LogLevel, DownsamplingMethod, Plate, From 94798e2c3eb833f0d01ee47730df7052d2afdd22 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 15:55:46 +0200 Subject: [PATCH 09/38] Check only V3 is passed into constructor/setter --- python/acquire-zarr-py.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/acquire-zarr-py.cpp b/python/acquire-zarr-py.cpp index 7c2473cc..a1113a31 100644 --- a/python/acquire-zarr-py.cpp +++ b/python/acquire-zarr-py.cpp @@ -1954,6 +1954,13 @@ PYBIND11_MODULE(acquire_zarr, m) if (s3) { settings.set_s3(*s3); } + if (version) { + if (*version != ZarrVersion_3) { + PyErr_SetString(PyExc_ValueError, + "Only ZarrVersion.V3 is supported."); + throw py::error_already_set(); + } + } if (max_threads) { settings.set_max_threads(*max_threads); } From e26da440b7c0fe166f921728048c02604b371f18 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: 
Tue, 14 Oct 2025 16:02:07 +0200 Subject: [PATCH 10/38] Test that setting version to 2 raises a RuntimeError --- python/tests/test_settings.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/tests/test_settings.py b/python/tests/test_settings.py index 2778dabb..0a6ad774 100644 --- a/python/tests/test_settings.py +++ b/python/tests/test_settings.py @@ -222,6 +222,13 @@ def test_set_dimensions_in_constructor(): assert settings.dimensions[2].shard_size_chunks == 9 +def test_set_version(settings): + assert settings.version == aqz.ZarrVersion.V3 + + with pytest.raises(RuntimeError): + settings.version = 2 # only V3 is supported + + def test_set_max_threads(settings): assert ( settings.max_threads > 0 From e74d0790cb8058206b62d65e25e5549cd94f59f8 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 16:04:41 +0200 Subject: [PATCH 11/38] Test that setting version to anything but 3 fails to validate (in C++) --- src/streaming/zarr.stream.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index ea3a4430..82dd4ff8 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -1008,6 +1008,11 @@ ZarrStream_s::validate_settings_(const struct ZarrStreamSettings_s* settings) return false; } + if (const auto version = settings->version; version != ZarrVersion_3) { + error_ = "Invalid Zarr version: " + std::to_string(version); + return false; + } + if (settings->store_path == nullptr) { error_ = "Null pointer: store_path"; return false; From b4fc900b0191db7ad5c0ed119e63291ebe9fd790 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 16:14:29 +0200 Subject: [PATCH 12/38] Default to `ZarrVersion_3` when `ZarrStreamSettings.version == 0` --- src/streaming/zarr.stream.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index 82dd4ff8..32a1c258 100644 --- 
a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -1008,7 +1008,9 @@ ZarrStream_s::validate_settings_(const struct ZarrStreamSettings_s* settings) return false; } - if (const auto version = settings->version; version != ZarrVersion_3) { + if (const auto version = + settings->version == 0 ? ZarrVersion_3 : settings->version; + version != ZarrVersion_3) { error_ = "Invalid Zarr version: " + std::to_string(version); return false; } From 077f0ec6c264e9784d4a63ac3db17c09e8cccca2 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 18:14:52 +0200 Subject: [PATCH 13/38] Break out compression and updating chunk table into their own methods --- src/streaming/array.cpp | 175 +++++++++++++++++++++++----------------- src/streaming/array.hh | 2 + 2 files changed, 102 insertions(+), 75 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 7d4be0e6..b0d97fed 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -603,6 +603,9 @@ zarr::Array::consolidate_chunks_(uint32_t shard_index) bool zarr::Array::compress_and_flush_data_() { + CHECK(compress_chunks_()); + update_table_entries_(); + // construct paths to shard sinks if they don't already exist if (data_paths_.empty()) { make_data_paths_(); @@ -620,91 +623,18 @@ zarr::Array::compress_and_flush_data_() const auto n_shards = dims->number_of_shards(); CHECK(data_paths_.size() == n_shards); - const auto chunks_in_memory = chunk_buffers_.size(); const auto n_layers = dims->chunk_layers_per_shard(); CHECK(n_layers > 0); - const auto chunk_group_offset = current_layer_ * chunks_in_memory; - std::atomic all_successful = 1; auto write_table = is_closing_ || should_rollover_(); - std::vector> futures; - - // queue jobs to compress all chunks - const auto bytes_of_raw_chunk = config_->dimensions->bytes_per_chunk(); - const auto bytes_per_px = bytes_of_type(config_->dtype); - - for (auto i = 0; i < chunks_in_memory; ++i) { - auto promise = std::make_shared>(); - 
futures.emplace_back(promise->get_future()); - - const auto chunk_idx = i + chunk_group_offset; - const auto shard_idx = dims->shard_index_for_chunk(chunk_idx); - const auto internal_idx = dims->shard_internal_index(chunk_idx); - auto* shard_table = shard_tables_.data() + shard_idx; - - if (config_->compression_params) { - const auto compression_params = config_->compression_params.value(); - - auto job = [&chunk_buffer = chunk_buffers_[i], - bytes_per_px, - compression_params, - shard_table, - shard_idx, - chunk_idx, - internal_idx, - promise, - &all_successful](std::string& err) { - bool success = false; - - try { - if (!chunk_buffer.compress(compression_params, - bytes_per_px)) { - err = "Failed to compress chunk " + - std::to_string(chunk_idx) + " (internal index " + - std::to_string(internal_idx) + " of shard " + - std::to_string(shard_idx) + ")"; - } - - // update shard table with size - shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); - success = true; - } catch (const std::exception& exc) { - err = exc.what(); - } - - promise->set_value(); - - all_successful.fetch_and(static_cast(success)); - return success; - }; - - // one thread is reserved for processing the frame queue and runs - // the entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || - !thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - } else { - // no compression, just update shard table with size - shard_table->at(2 * internal_idx + 1) = bytes_of_raw_chunk; - } - } - - // if we're not compressing, there aren't any futures to wait for - for (auto& future : futures) { - future.wait(); - } - futures.clear(); - const auto bucket_name = config_->bucket_name; auto connection_pool = s3_connection_pool_; + std::vector> futures; + // wait for the chunks in each shard to finish compressing, then defragment // and write the shard for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { @@ -849,6 +779,101 @@ 
zarr::Array::should_rollover_() const return frames_written_ % frames_before_flush == 0; } +bool +zarr::Array::compress_chunks_() +{ + if (!config_->compression_params) { + return true; // nothing to do + } + + std::atomic all_successful = 1; + + const auto& params = *config_->compression_params; + const size_t bytes_per_px = bytes_of_type(config_->dtype); + + const auto& dims = config_->dimensions; + + const uint32_t chunks_in_memory = chunk_buffers_.size(); + const uint32_t chunk_group_offset = current_layer_ * chunks_in_memory; + + std::vector> futures; + futures.reserve(chunks_in_memory); + + for (size_t i = 0; i < chunks_in_memory; ++i) { + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + const uint32_t chunk_idx = i + chunk_group_offset; + const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); + const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); + auto* shard_table = shard_tables_.data() + shard_idx; + + auto job = [&chunk_buffer = chunk_buffers_[i], + bytes_per_px, + ¶ms, + shard_table, + shard_idx, + chunk_idx, + internal_idx, + promise, + &all_successful](std::string& err) { + bool success = false; + + try { + if (!chunk_buffer.compress(params, bytes_per_px)) { + err = "Failed to compress chunk " + + std::to_string(chunk_idx) + " (internal index " + + std::to_string(internal_idx) + " of shard " + + std::to_string(shard_idx) + ")"; + } + + // update shard table with size + shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); + success = true; + } catch (const std::exception& exc) { + err = exc.what(); + } + + promise->set_value(); + + all_successful.fetch_and(static_cast(success)); + return success; + }; + + // one thread is reserved for processing the frame queue and runs + // the entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + if (std::string err; !job(err)) { + LOG_ERROR(err); + } + } + } + + for (auto& future : futures) { + 
future.wait(); + } + + return static_cast(all_successful); +} + +void +zarr::Array::update_table_entries_() +{ + const uint32_t chunks_in_memory = chunk_buffers_.size(); + const uint32_t chunk_group_offset = current_layer_ * chunks_in_memory; + const auto& dims = config_->dimensions; + + for (auto i = 0; i < chunks_in_memory; ++i) { + const auto& chunk_buffer = chunk_buffers_[i]; + const uint32_t chunk_idx = i + chunk_group_offset; + const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); + const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); + auto& shard_table = shard_tables_[shard_idx]; + + shard_table[2 * internal_idx + 1] = chunk_buffer.size(); + } +} + void zarr::Array::rollover_() { diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 4010dae0..a90aefab 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -61,6 +61,8 @@ class Array : public ArrayBase [[nodiscard]] ByteVector consolidate_chunks_(uint32_t shard_index); [[nodiscard]] bool compress_and_flush_data_(); + [[nodiscard]] bool compress_chunks_(); + void update_table_entries_(); void rollover_(); void close_sinks_(); From 7928c0768f16bdddfeb1358abab9bdb4c3742627 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 14 Oct 2025 19:28:50 +0200 Subject: [PATCH 14/38] Precompute frames before flush --- src/streaming/array.cpp | 10 +--------- src/streaming/array.dimensions.cpp | 22 ++++++++++++++++++++++ src/streaming/array.dimensions.hh | 7 ++++++- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index b0d97fed..84ef6bb5 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -768,15 +768,7 @@ bool zarr::Array::should_rollover_() const { const auto& dims = config_->dimensions; - const auto& append_dim = dims->final_dim(); - size_t frames_before_flush = - append_dim.chunk_size_px * append_dim.shard_size_chunks; - for (auto i = 1; i < dims->ndims() - 2; ++i) { - 
frames_before_flush *= dims->at(i).array_size_px; - } - - CHECK(frames_before_flush > 0); - return frames_written_ % frames_before_flush == 0; + return frames_written_ % dims->frames_before_flush() == 0; } bool diff --git a/src/streaming/array.dimensions.cpp b/src/streaming/array.dimensions.cpp index 81203551..c7770d30 100644 --- a/src/streaming/array.dimensions.cpp +++ b/src/streaming/array.dimensions.cpp @@ -13,6 +13,9 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, { EXPECT(dims_.size() > 2, "Array must have at least three dimensions."); + frames_before_flush_ = + final_dim().chunk_size_px * final_dim().shard_size_chunks; + for (auto i = 0; i < dims_.size(); ++i) { const auto& dim = dims_[i]; bytes_per_chunk_ *= dim.chunk_size_px; @@ -21,9 +24,18 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, if (i > 0) { number_of_chunks_in_memory_ *= zarr::chunks_along_dimension(dim); number_of_shards_ *= zarr::shards_along_dimension(dim); + frames_before_flush_ *= dim.array_size_px; } } + EXPECT(number_of_chunks_in_memory_ > 0, + "Array must have at least one chunk in memory."); + EXPECT(chunks_per_shard_ > 0, + "Array must have at least one chunk per shard."); + EXPECT(number_of_shards_ > 0, "Array must have at least one shard."); + EXPECT(frames_before_flush_ > 0, + "Array must have at least one frame before flush."); + chunk_indices_for_shard_.resize(number_of_shards_); for (auto i = 0; i < chunks_per_shard_ * number_of_shards_; ++i) { @@ -216,6 +228,16 @@ ArrayDimensions::shard_internal_index(uint32_t chunk_index) const return shard_internal_indices_.at(chunk_index); } +/** + * @brief Get the number of frames before a flush is triggered. + * @return The number of frames before a flush. 
+ */ +uint64_t +ArrayDimensions::frames_before_flush() const +{ + return frames_before_flush_; +} + uint32_t ArrayDimensions::shard_index_for_chunk_(uint32_t chunk_index) const { diff --git a/src/streaming/array.dimensions.hh b/src/streaming/array.dimensions.hh index 3a806c09..a9511027 100644 --- a/src/streaming/array.dimensions.hh +++ b/src/streaming/array.dimensions.hh @@ -115,7 +115,8 @@ class ArrayDimensions uint32_t chunk_layers_per_shard() const; /** - * @brief Get the shard index for a given chunk index, given array dimensions. + * @brief Get the shard index for a given chunk index, given array + * dimensions. * @param chunk_index The index of the chunk. * @return The index of the shard containing the chunk. */ @@ -145,6 +146,8 @@ class ArrayDimensions */ uint32_t shard_internal_index(uint32_t chunk_index) const; + uint64_t frames_before_flush() const; + private: std::vector dims_; ZarrDataType dtype_; @@ -159,6 +162,8 @@ class ArrayDimensions std::unordered_map shard_internal_indices_; std::vector> chunk_indices_for_shard_; + uint64_t frames_before_flush_; + uint32_t shard_index_for_chunk_(uint32_t chunk_index) const; uint32_t shard_internal_index_(uint32_t chunk_index) const; }; From 5b5749c862b1bcf2b78ea5158eb4d6b3cd08fbe6 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 08:51:32 +0200 Subject: [PATCH 15/38] (wip): builds but fails --- src/streaming/CMakeLists.txt | 20 +- src/streaming/array.base.cpp | 96 +---- src/streaming/array.base.hh | 26 +- src/streaming/array.cpp | 354 +++--------------- src/streaming/array.hh | 18 +- src/streaming/file.handle.cpp | 63 +++- src/streaming/file.handle.hh | 30 +- src/streaming/file.sink.cpp | 76 ---- src/streaming/file.sink.hh | 28 -- src/streaming/fs.array.cpp | 233 ++++++++++++ src/streaming/fs.array.hh | 23 ++ src/streaming/fs.multiscale.array.cpp | 55 +++ src/streaming/fs.multiscale.array.hh | 22 ++ src/streaming/fs.storage.cpp | 64 ++++ src/streaming/fs.storage.hh | 40 ++ 
src/streaming/multiscale.array.cpp | 56 +-- src/streaming/multiscale.array.hh | 12 +- src/streaming/s3.array.cpp | 154 ++++++++ src/streaming/s3.array.hh | 23 ++ src/streaming/s3.multiscale.array.cpp | 55 +++ src/streaming/s3.multiscale.array.hh | 22 ++ src/streaming/{s3.sink.cpp => s3.object.cpp} | 88 +++-- src/streaming/{s3.sink.hh => s3.object.hh} | 31 +- src/streaming/s3.storage.cpp | 77 ++++ src/streaming/s3.storage.hh | 54 +++ src/streaming/sink.cpp | 205 +--------- src/streaming/sink.hh | 149 -------- src/streaming/zarr.stream.cpp | 252 +++++++------ src/streaming/zarr.stream.hh | 12 +- tests/unit-tests/CMakeLists.txt | 4 - tests/unit-tests/array-write-even.cpp | 8 +- .../array-write-ragged-append-dim.cpp | 10 +- .../array-write-ragged-internal-dim.cpp | 8 +- tests/unit-tests/construct-data-paths.cpp | 65 ---- tests/unit-tests/file-sink-write.cpp | 54 --- tests/unit-tests/make-data-sinks.cpp | 292 --------------- tests/unit-tests/s3-sink-write-multipart.cpp | 8 +- tests/unit-tests/s3-sink-write.cpp | 8 +- 38 files changed, 1217 insertions(+), 1578 deletions(-) delete mode 100644 src/streaming/file.sink.cpp delete mode 100644 src/streaming/file.sink.hh create mode 100644 src/streaming/fs.array.cpp create mode 100644 src/streaming/fs.array.hh create mode 100644 src/streaming/fs.multiscale.array.cpp create mode 100644 src/streaming/fs.multiscale.array.hh create mode 100644 src/streaming/fs.storage.cpp create mode 100644 src/streaming/fs.storage.hh create mode 100644 src/streaming/s3.array.cpp create mode 100644 src/streaming/s3.array.hh create mode 100644 src/streaming/s3.multiscale.array.cpp create mode 100644 src/streaming/s3.multiscale.array.hh rename src/streaming/{s3.sink.cpp => s3.object.cpp} (87%) rename src/streaming/{s3.sink.hh => s3.object.hh} (60%) create mode 100644 src/streaming/s3.storage.cpp create mode 100644 src/streaming/s3.storage.hh delete mode 100644 src/streaming/sink.hh delete mode 100644 tests/unit-tests/construct-data-paths.cpp delete 
mode 100644 tests/unit-tests/file-sink-write.cpp delete mode 100644 tests/unit-tests/make-data-sinks.cpp diff --git a/src/streaming/CMakeLists.txt b/src/streaming/CMakeLists.txt index fe1713a1..c1c53251 100644 --- a/src/streaming/CMakeLists.txt +++ b/src/streaming/CMakeLists.txt @@ -29,19 +29,27 @@ add_library(${tgt} s3.connection.cpp file.handle.hh file.handle.cpp - sink.hh - sink.cpp - file.sink.hh - file.sink.cpp ${PLATFORM_CPP} - s3.sink.hh - s3.sink.cpp + s3.object.hh + s3.object.cpp array.base.hh array.base.cpp array.hh array.cpp multiscale.array.hh multiscale.array.cpp + fs.storage.hh + fs.storage.cpp + fs.array.hh + fs.array.cpp + fs.multiscale.array.hh + fs.multiscale.array.cpp + s3.storage.hh + s3.storage.cpp + s3.array.hh + s3.array.cpp + s3.multiscale.array.hh + s3.multiscale.array.cpp plate.hh plate.cpp $ diff --git a/src/streaming/array.base.cpp b/src/streaming/array.base.cpp index 391f7068..d2ef195c 100644 --- a/src/streaming/array.base.cpp +++ b/src/streaming/array.base.cpp @@ -6,18 +6,12 @@ #include "multiscale.array.hh" zarr::ArrayBase::ArrayBase(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) + std::shared_ptr thread_pool) : config_(config) , thread_pool_(thread_pool) - , s3_connection_pool_(s3_connection_pool) - , file_handle_pool_(file_handle_pool) { CHECK(config_); // required CHECK(thread_pool_); // required - EXPECT(s3_connection_pool_ != nullptr || file_handle_pool_ != nullptr, - "Either S3 connection pool or file handle pool must be provided."); } std::string @@ -31,94 +25,6 @@ zarr::ArrayBase::node_path_() const return key; } -bool -zarr::ArrayBase::make_metadata_sinks_() -{ - metadata_sinks_.clear(); - - try { - const auto sink_keys = metadata_keys_(); - for (const auto& key : sink_keys) { - const std::string path = node_path_() + "/" + key; - std::unique_ptr sink = - config_->bucket_name - ? 
make_s3_sink(*config_->bucket_name, path, s3_connection_pool_) - : make_file_sink(path, file_handle_pool_); - - if (sink == nullptr) { - LOG_ERROR("Failed to create metadata sink for ", key); - return false; - } - metadata_sinks_.emplace(key, std::move(sink)); - } - } catch (const std::exception& exc) { - LOG_ERROR("Failed to create metadata sinks: ", exc.what()); - return false; - } - - return true; -} - -bool -zarr::ArrayBase::write_metadata_() -{ - if (!make_metadata_()) { - LOG_ERROR("Failed to make metadata."); - return false; - } - - if (!make_metadata_sinks_()) { - LOG_ERROR("Failed to make metadata sinks."); - return false; - } - - for (const auto& [key, metadata] : metadata_strings_) { - const auto it = metadata_sinks_.find(key); - if (it == metadata_sinks_.end()) { - LOG_ERROR("Metadata sink not found for key: ", key); - return false; - } - - auto& sink = it->second; - if (!sink) { - LOG_ERROR("Metadata sink is null for key: ", key); - return false; - } - - std::span data{ reinterpret_cast(metadata.data()), - metadata.size() }; - if (!sink->write(0, data)) { - LOG_ERROR("Failed to write metadata for key: ", key); - return false; - } - } - - return true; -} - -std::unique_ptr -zarr::make_array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) -{ - // create a multiscale array at the dataset root (node_key is empty) or if - // we have a genuine multiscale dataset - const auto multiscale = - config->node_key.empty() || config->downsampling_method.has_value(); - - std::unique_ptr array; - if (multiscale) { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } else { - array = std::make_unique( - config, thread_pool, file_handle_pool, s3_connection_pool); - } - - return array; -} - bool zarr::finalize_array(std::unique_ptr&& array) { diff --git a/src/streaming/array.base.hh b/src/streaming/array.base.hh index dfa4075a..14d35e34 100644 --- 
a/src/streaming/array.base.hh +++ b/src/streaming/array.base.hh @@ -2,10 +2,7 @@ #include "array.dimensions.hh" #include "blosc.compression.params.hh" -#include "file.handle.hh" #include "locked.buffer.hh" -#include "s3.connection.hh" -#include "sink.hh" #include "thread.pool.hh" #include "zarr.types.h" @@ -56,9 +53,7 @@ class ArrayBase { public: ArrayBase(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); + std::shared_ptr thread_pool); virtual ~ArrayBase() = default; /** @@ -83,26 +78,25 @@ class ArrayBase protected: std::shared_ptr config_; std::shared_ptr thread_pool_; - std::shared_ptr s3_connection_pool_; - std::shared_ptr file_handle_pool_; - std::unordered_map metadata_strings_; - std::unordered_map> metadata_sinks_; + std::string metadata_str_; std::string node_path_() const; - [[nodiscard]] virtual bool make_metadata_() = 0; - virtual std::vector metadata_keys_() const = 0; - [[nodiscard]] bool make_metadata_sinks_(); - [[nodiscard]] bool write_metadata_(); + [[nodiscard]] virtual bool make_metadata_(std::string& metadata_str) = 0; + [[nodiscard]] virtual bool write_metadata_() = 0; friend bool finalize_array(std::unique_ptr&& array); }; +template std::unique_ptr make_array(std::shared_ptr config, std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); + Args&&... 
args) +{ + return std::make_unique( + config, thread_pool, std::forward(args)...); +} [[nodiscard]] bool finalize_array(std::unique_ptr&& array); diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 84ef6bb5..baea640a 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -1,10 +1,8 @@ #include "array.hh" #include "macros.hh" -#include "sink.hh" #include "zarr.common.hh" #include -#include #include // std::fill #include @@ -63,10 +61,8 @@ shuffle_to_string(uint8_t shuffle) } // namespace zarr::Array::Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : ArrayBase(config, thread_pool, file_handle_pool, s3_connection_pool) + std::shared_ptr thread_pool) + : ArrayBase(config, thread_pool) , bytes_to_flush_{ 0 } , frames_written_{ 0 } , append_chunk_index_{ 0 } @@ -150,17 +146,9 @@ zarr::Array::write_frame(LockedBuffer& data) return bytes_written; } -std::vector -zarr::Array::metadata_keys_() const -{ - return { "zarr.json" }; -} - bool -zarr::Array::make_metadata_() +zarr::Array::make_metadata_(std::string& metadata_str) { - metadata_strings_.clear(); - std::vector array_shape, chunk_shape, shard_shape; const auto& dims = config_->dimensions; @@ -260,7 +248,7 @@ zarr::Array::make_metadata_() metadata["codecs"] = codecs; - metadata_strings_.emplace("zarr.json", metadata.dump(4)); + metadata_str_ = metadata.dump(4); return true; } @@ -273,20 +261,14 @@ zarr::Array::close_() try { if (bytes_to_flush_ > 0) { CHECK(compress_and_flush_data_()); - } else { - CHECK(close_impl_()); + } else if (current_layer_ > 0) { + CHECK(flush_tables_()); } - close_sinks_(); + close_io_streams_(); if (frames_written_ > 0) { CHECK(write_metadata_()); - for (auto& [key, sink] : metadata_sinks_) { - EXPECT(zarr::finalize_sink(std::move(sink)), - "Failed to finalize metadata sink ", - key); - } } - metadata_sinks_.clear(); retval = true; } catch (const std::exception& exc) { 
LOG_ERROR("Failed to finalize array writer: ", exc.what()); @@ -296,129 +278,52 @@ zarr::Array::close_() return retval; } -bool -zarr::Array::close_impl_() +void +zarr::Array::make_data_paths_() { - if (current_layer_ == 0) { - return true; - } - - // write the table - const auto& dims = config_->dimensions; - const auto n_shards = dims->number_of_shards(); - std::vector> futures; - - std::atomic all_successful = 1; - - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const std::string data_path = data_paths_[shard_idx]; - auto* file_offset = shard_file_offsets_.data() + shard_idx; - auto* shard_table = shard_tables_.data() + shard_idx; - - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [shard_idx, - data_path, - shard_table, - file_offset, - promise, - &all_successful, - this](std::string& err) { - bool success = true; - - try { - std::unique_ptr sink; - - if (data_sinks_.contains( - data_path)) { // sink already constructed - sink = std::move(data_sinks_[data_path]); - data_sinks_.erase(data_path); - } else { - sink = make_data_sink_(data_path); - } - - if (sink == nullptr) { - err = "Failed to create sink for " + data_path; - success = false; - } else { - const auto table_size = - shard_table->size() * sizeof(uint64_t); - std::vector table(table_size + sizeof(uint32_t)); - - // copy the table data - memcpy(table.data(), shard_table->data(), table_size); - const auto* table_ptr = table.data(); - - // compute crc32 checksum of the table - const uint32_t checksum = - crc32c::Crc32c(table_ptr, table_size); - memcpy( - table.data() + table_size, &checksum, sizeof(uint32_t)); - - if (!sink->write(*file_offset, table)) { - err = "Failed to write table and checksum to shard " + - std::to_string(shard_idx); - success = false; - } + if (data_paths_.empty()) { + const auto& dimensions = config_->dimensions; + + std::queue paths_queue; + paths_queue.emplace(data_root_); + + // create intermediate paths + for 
(auto i = 1; // skip the last dimension + i < dimensions->ndims() - 1; // skip the x dimension + ++i) { + const auto& dim = dimensions->at(i); + const auto n_parts = shards_along_dimension(dim); + CHECK(n_parts); + + auto n_paths = paths_queue.size(); + for (auto j = 0; j < n_paths; ++j) { + const auto path = paths_queue.front(); + paths_queue.pop(); + + for (auto k = 0; k < n_parts; ++k) { + const auto kstr = std::to_string(k); + paths_queue.push(path + (path.empty() ? kstr : "/" + kstr)); } - } catch (const std::exception& exc) { - err = "Failed to flush data: " + std::string(exc.what()); - success = false; } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - if (std::string err; !job(err)) { - LOG_ERROR(err); + } + + // create final paths + data_paths_.reserve(paths_queue.size() * + shards_along_dimension(dimensions->width_dim())); + { + const auto& dim = dimensions->width_dim(); + const auto n_parts = shards_along_dimension(dim); + CHECK(n_parts); + + auto n_paths = paths_queue.size(); + for (auto i = 0; i < n_paths; ++i) { + const auto path = paths_queue.front(); + paths_queue.pop(); + for (auto j = 0; j < n_parts; ++j) + data_paths_.push_back(path + "/" + std::to_string(j)); } } } - - return all_successful; -} - -bool -zarr::Array::is_s3_array_() const -{ - return config_->bucket_name.has_value(); -} - -void -zarr::Array::make_data_paths_() -{ - if (data_paths_.empty()) { - data_paths_ = construct_data_paths( - data_root_, *config_->dimensions, shards_along_dimension); - } -} - -std::unique_ptr -zarr::Array::make_data_sink_(std::string_view path) -{ - const auto is_s3 = is_s3_array_(); - - std::unique_ptr sink; - - // create parent directories if needed - if (is_s3) { - const auto bucket_name = *config_->bucket_name; - sink = 
make_s3_sink(bucket_name, path, s3_connection_pool_); - } else { - const auto parent_paths = get_parent_paths(data_paths_); - CHECK(make_dirs(parent_paths, thread_pool_)); - - sink = make_file_sink(path, file_handle_pool_); - } - - return sink; } void @@ -603,152 +508,27 @@ zarr::Array::consolidate_chunks_(uint32_t shard_index) bool zarr::Array::compress_and_flush_data_() { - CHECK(compress_chunks_()); - update_table_entries_(); - - // construct paths to shard sinks if they don't already exist - if (data_paths_.empty()) { - make_data_paths_(); + if (!compress_chunks_()) { + LOG_ERROR("Failed to compress chunk data"); + return false; } - // create parent directories if needed - const auto is_s3 = is_s3_array_(); - if (!is_s3) { - const auto parent_paths = get_parent_paths(data_paths_); - CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist - } - - const auto& dims = config_->dimensions; - - const auto n_shards = dims->number_of_shards(); - CHECK(data_paths_.size() == n_shards); - - const auto n_layers = dims->chunk_layers_per_shard(); - CHECK(n_layers > 0); - - std::atomic all_successful = 1; - - auto write_table = is_closing_ || should_rollover_(); - - const auto bucket_name = config_->bucket_name; - auto connection_pool = s3_connection_pool_; - - std::vector> futures; - - // wait for the chunks in each shard to finish compressing, then defragment - // and write the shard - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const std::string data_path = data_paths_[shard_idx]; - auto* file_offset = shard_file_offsets_.data() + shard_idx; - auto* shard_table = shard_tables_.data() + shard_idx; - - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [shard_idx, - is_s3, - data_path, - shard_table, - file_offset, - write_table, - bucket_name, - connection_pool, - promise, - &all_successful, - this](std::string& err) { - bool success = true; - std::unique_ptr sink; - - try { - // consolidate 
chunks in shard - const auto shard_data = consolidate_chunks_(shard_idx); - - if (data_sinks_.contains(data_path)) { // S3 sink, constructed - sink = std::move(data_sinks_[data_path]); - data_sinks_.erase(data_path); - } else { - sink = make_data_sink_(data_path); - } - - if (sink == nullptr) { - err = "Failed to create sink for " + data_path; - success = false; - } else { - success = sink->write(*file_offset, shard_data); - if (!success) { - err = "Failed to write shard at path " + data_path; - } else { - *file_offset += shard_data.size(); - - if (write_table) { - const size_t table_size = - shard_table->size() * sizeof(uint64_t); - std::vector table( - table_size + sizeof(uint32_t), 0); - - memcpy( - table.data(), shard_table->data(), table_size); - - // compute crc32 checksum of the table - const uint32_t checksum = - crc32c::Crc32c(table.data(), table_size); - memcpy(table.data() + table_size, - &checksum, - sizeof(uint32_t)); - - if (!sink->write(*file_offset, table)) { - err = "Failed to write table and checksum to " - "shard " + - std::to_string(shard_idx); - success = false; - } - } - } - } - } catch (const std::exception& exc) { - err = "Failed to flush data: " + std::string(exc.what()); - success = false; - } - - if (sink != nullptr) { - data_sinks_.emplace(data_path, std::move(sink)); - } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - } + update_table_entries_(); - // wait for all threads to finish - for (auto& future : futures) { - future.wait(); + if (!flush_data_()) { + LOG_ERROR("Failed to flush chunk data"); + return false; } - // reset shard tables and file offsets - if (write_table) { - for (auto& table : shard_tables_) { - std::fill( - table.begin(), table.end(), 
std::numeric_limits::max()); - } - - std::fill(shard_file_offsets_.begin(), shard_file_offsets_.end(), 0); - current_layer_ = 0; - } else { + if (const auto should_write_table = is_closing_ || should_rollover_(); + should_write_table && !flush_tables_()) { + LOG_ERROR("Failed to flush shard tables"); + return false; + } else if (!should_write_table) { ++current_layer_; } - return static_cast(all_successful); + return true; } bool @@ -871,19 +651,7 @@ zarr::Array::rollover_() { LOG_DEBUG("Rolling over"); - close_sinks_(); + close_io_streams_(); ++append_chunk_index_; data_root_ = node_path_() + "/c/" + std::to_string(append_chunk_index_); } - -void -zarr::Array::close_sinks_() -{ - data_paths_.clear(); - - for (auto& [path, sink] : data_sinks_) { - EXPECT( - finalize_sink(std::move(sink)), "Failed to finalize sink at ", path); - } - data_sinks_.clear(); -} diff --git a/src/streaming/array.hh b/src/streaming/array.hh index a90aefab..40c1852b 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -1,11 +1,8 @@ #pragma once #include "array.base.hh" -#include "blosc.compression.params.hh" #include "definitions.hh" -#include "file.sink.hh" #include "locked.buffer.hh" -#include "s3.connection.hh" #include "thread.pool.hh" namespace zarr { @@ -15,9 +12,7 @@ class Array : public ArrayBase { public: Array(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); + std::shared_ptr thread_pool); size_t memory_usage() const noexcept override; @@ -29,7 +24,6 @@ class Array : public ArrayBase /// Filesystem std::vector data_paths_; - std::unordered_map> data_sinks_; /// Bookkeeping uint64_t bytes_to_flush_; @@ -43,15 +37,11 @@ class Array : public ArrayBase std::vector shard_file_offsets_; std::vector> shard_tables_; - std::vector metadata_keys_() const override; - bool make_metadata_() override; + bool make_metadata_(std::string& metadata) override; [[nodiscard]] bool close_() override; 
[[nodiscard]] bool close_impl_(); - bool is_s3_array_() const; - void make_data_paths_(); - [[nodiscard]] std::unique_ptr make_data_sink_(std::string_view path); void fill_buffers_(); bool should_flush_() const; @@ -63,8 +53,10 @@ class Array : public ArrayBase [[nodiscard]] bool compress_and_flush_data_(); [[nodiscard]] bool compress_chunks_(); void update_table_entries_(); + [[nodiscard]] virtual bool flush_data_() = 0; + [[nodiscard]] virtual bool flush_tables_() = 0; void rollover_(); - void close_sinks_(); + virtual void close_io_streams_() = 0; friend class MultiscaleArray; }; diff --git a/src/streaming/file.handle.cpp b/src/streaming/file.handle.cpp index 34671a4a..75551627 100644 --- a/src/streaming/file.handle.cpp +++ b/src/streaming/file.handle.cpp @@ -1,7 +1,6 @@ #include "definitions.hh" #include "file.handle.hh" - -#include +#include "macros.hh" void* init_handle(const std::string& filename, void* flags); @@ -33,32 +32,68 @@ zarr::FileHandle::get() const zarr::FileHandlePool::FileHandlePool() : max_active_handles_(get_max_active_handles()) - , n_active_handles_(0) { } -std::unique_ptr +zarr::FileHandlePool::~FileHandlePool() +{ + // wait until the pool has been drained + std::unique_lock lock(mutex_); + while (!handle_map_.empty()) { + if (!evict_idle_handle_()) { + cv_.wait(lock, [&] { return true; }); + } + } +} + +std::shared_ptr zarr::FileHandlePool::get_handle(const std::string& filename, void* flags) { std::unique_lock lock(mutex_); - if (n_active_handles_ >= max_active_handles_) { - cv_.wait(lock, - [this]() { return n_active_handles_ < max_active_handles_; }); + if (const auto it = handle_map_.find(filename); it != handle_map_.end()) { + return it->second->second.lock(); } - ++n_active_handles_; - return std::make_unique(filename, flags); + cv_.wait(lock, [&] { return handles_.size() < max_active_handles_; }); + std::shared_ptr handle(init_handle(filename, flags), [](void* h) { + flush_file(h); + destroy_handle(h); + }); + + EXPECT(handle != 
nullptr, "Failed to create file handle for " + filename); + + handles_.emplace_front(filename, handle); + handle_map_[filename] = handles_.begin(); + + return handle; } void -zarr::FileHandlePool::return_handle(std::unique_ptr&& handle) +zarr::FileHandlePool::close_handle(const std::string& filename) { std::unique_lock lock(mutex_); + if (const auto it = handle_map_.find(filename); it != handle_map_.end()) { + handles_.erase(it->second); + handle_map_.erase(it); + cv_.notify_all(); + } +} + +bool +zarr::FileHandlePool::evict_idle_handle_() +{ + bool evicted = false; + for (auto it = handles_.begin(); it != handles_.end(); ++it) { + if (it->second.expired()) { + handle_map_.erase(it->first); + handles_.erase(it); + evicted = true; + } + } - if (handle != nullptr && n_active_handles_ > 0) { - --n_active_handles_; + if (evicted) { + cv_.notify_all(); } - // handle will be destroyed when going out of scope - flush_file(handle->get()); + return evicted; } diff --git a/src/streaming/file.handle.hh b/src/streaming/file.handle.hh index c1ad87df..e3281008 100644 --- a/src/streaming/file.handle.hh +++ b/src/streaming/file.handle.hh @@ -1,6 +1,7 @@ #pragma once #include +#include #include // for std::unique_ptr #include #include @@ -44,7 +45,7 @@ class FileHandlePool { public: FileHandlePool(); - ~FileHandlePool() = default; + ~FileHandlePool(); /** * @brief Get a file handle for the specified filename. @@ -52,26 +53,29 @@ class FileHandlePool * been reached, until a handle is returned to the pool. * @param filename The path to the file to open. * @param flags Platform-specific flags for opening the file. - * @return A unique pointer to a FileHandle, or nullptr on failure. + * @return A shared pointer to a file handle, or nullptr on failure. 
*/ - std::unique_ptr get_handle(const std::string& filename, - void* flags); - - std::shared_ptr get_handle_shared(const std::string& filename, - void* flags); + std::shared_ptr get_handle(const std::string& filename, void* flags); /** - * @brief Return a file handle to the pool. - * @details This function should be called when a file handle is no longer - * needed, to allow other threads to acquire a handle. - * @param handle The file handle to return. + * @brief Close the handle for the specified filename, if it exists in the + * pool. This will remove the handle from the pool and close the underlying + * file. + * @param filename The path to the file whose handle should be closed. */ - void return_handle(std::unique_ptr&& handle); + void close_handle(const std::string& filename); private: + using HandleEntry = std::pair>; + using HandleList = std::list; + const uint64_t max_active_handles_; - std::atomic n_active_handles_; + HandleList handles_; + std::unordered_map handle_map_; + std::mutex mutex_; std::condition_variable cv_; + + bool evict_idle_handle_(); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/file.sink.cpp b/src/streaming/file.sink.cpp deleted file mode 100644 index 20fbeec3..00000000 --- a/src/streaming/file.sink.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include "file.sink.hh" -#include "macros.hh" - -#include - -void* -make_flags(); - -void -destroy_flags(void*); - -bool -seek_and_write(void* handle, size_t offset, ConstByteSpan data); - -bool -flush_file(void* handle); - -zarr::FileSink::FileSink(std::string_view filename, - std::shared_ptr file_handle_pool) - : file_handle_pool_(file_handle_pool) - , filename_(filename) - , flags_(make_flags()) -{ - EXPECT(file_handle_pool_ != nullptr, "File handle pool not provided."); -} - -zarr::FileSink::~FileSink() -{ - destroy_flags(flags_); - flags_ = nullptr; -} - -bool -zarr::FileSink::write(size_t offset, ConstByteSpan data) -{ - if (data.data() == nullptr || data.size() == 0) 
{ - return true; - } - - auto handle = file_handle_pool_->get_handle(filename_, flags_); - if (handle == nullptr) { - LOG_ERROR("Failed to get file handle for ", filename_); - return false; - } - - bool retval = false; - try { - retval = seek_and_write(handle->get(), offset, data); - } catch (const std::exception& exc) { - LOG_ERROR("Failed to write to file ", filename_, ": ", exc.what()); - } - - file_handle_pool_->return_handle(std::move(handle)); - - return retval; -} - -bool -zarr::FileSink::flush_() -{ - auto handle = file_handle_pool_->get_handle(filename_, flags_); - if (handle == nullptr) { - LOG_ERROR("Failed to get file handle for ", filename_); - return false; - } - - bool retval = false; - try { - retval = flush_file(handle->get()); - } catch (const std::exception& exc) { - LOG_ERROR("Failed to flush file ", filename_, ": ", exc.what()); - } - file_handle_pool_->return_handle(std::move(handle)); - - return retval; -} \ No newline at end of file diff --git a/src/streaming/file.sink.hh b/src/streaming/file.sink.hh deleted file mode 100644 index 49b06f38..00000000 --- a/src/streaming/file.sink.hh +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "file.handle.hh" -#include "sink.hh" - -#include -#include - -namespace zarr { -class FileSink : public Sink -{ - public: - FileSink(std::string_view filename, - std::shared_ptr file_handle_pool); - ~FileSink() override; - - bool write(size_t offset, ConstByteSpan data) override; - - protected: - bool flush_() override; - - private: - std::shared_ptr file_handle_pool_; - - std::string filename_; - void* flags_; -}; -} // namespace zarr diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp new file mode 100644 index 00000000..48d7b59c --- /dev/null +++ b/src/streaming/fs.array.cpp @@ -0,0 +1,233 @@ +#include "fs.array.hh" +#include "macros.hh" + +#include + +#include +#include +#include + +namespace fs = std::filesystem; + +namespace { +std::vector +get_parent_paths(const std::vector& 
file_paths) +{ + std::unordered_set unique_paths; + for (const auto& file_path : file_paths) { + unique_paths.emplace(fs::path(file_path).parent_path().string()); + } + + return { unique_paths.begin(), unique_paths.end() }; +} + +bool +make_dirs(const std::vector& dir_paths, + std::shared_ptr thread_pool) +{ + if (dir_paths.empty()) { + return true; + } + EXPECT(thread_pool, "Thread pool not provided."); + + std::atomic all_successful = 1; + const std::unordered_set unique_paths(dir_paths.begin(), dir_paths.end()); + + std::vector> futures; + + for (const auto& path : unique_paths) { + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + auto job = [path, promise, &all_successful](std::string& err) { + bool success = true; + try { + if (fs::is_directory(path) || path.empty()) { + promise->set_value(); + return success; + } + + std::error_code ec; + if (!fs::create_directories(path, ec) && + !fs::is_directory(path)) { + err = "Failed to create directory '" + path + + "': " + ec.message(); + success = false; + } + } catch (const std::exception& exc) { + err = + "Failed to create directory '" + path + "': " + exc.what(); + success = false; + } + + promise->set_value(); + all_successful.fetch_and(success); + return success; + }; + + if (thread_pool->n_threads() == 1 || !thread_pool->push_job(job)) { + if (std::string err; !job(err)) { + LOG_ERROR(err); + } + } + } + + // wait for all jobs to finish + for (auto& future : futures) { + future.wait(); + } + + return all_successful; +} +} // namespace + +zarr::FSArray::FSArray(std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr file_handle_pool) + : Array(config, thread_pool) + , FSStorage(file_handle_pool) +{ +} + +bool +zarr::FSArray::write_metadata_() +{ + std::string metadata; + if (!make_metadata_(metadata)) { + LOG_ERROR("Failed to make metadata."); + return false; + } + const std::string path = node_path_() + "/zarr.json"; + + return write_string_(path, metadata, 
0); +} + +bool +zarr::FSArray::flush_data_() +{ + // construct paths to shard sinks if they don't already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + // create parent directories if needed + const auto parent_paths = get_parent_paths(data_paths_); + CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist + + const auto& dims = config_->dimensions; + + const auto n_shards = dims->number_of_shards(); + CHECK(data_paths_.size() == n_shards); + + std::atomic all_successful = 1; + + std::vector> futures; + + // wait for the chunks in each shard to finish compressing, then defragment + // and write the shard + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const std::string data_path = data_paths_[shard_idx]; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + auto job = + [shard_idx, data_path, file_offset, promise, &all_successful, this]( + std::string& err) { + bool success = true; + + try { + // consolidate chunks in shard + const auto shard_data = consolidate_chunks_(shard_idx); + if (!write_binary_(data_path, shard_data, *file_offset)) { + err = "Failed to write shard at path " + data_path; + success = false; + } else { + *file_offset = shard_data.size(); + } + } catch (const std::exception& exc) { + err = "Failed to flush data: " + std::string(exc.what()); + success = false; + } + + all_successful.fetch_and(success); + promise->set_value(); + + return success; + }; + + // one thread is reserved for processing the frame queue and runs the + // entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + std::string err; + if (!job(err)) { + LOG_ERROR(err); + } + } + } + + // wait for all threads to finish + for (auto& future : futures) { + future.wait(); + } + + return static_cast(all_successful); +} + +bool +zarr::FSArray::flush_tables_() +{ + // construct paths to shard 
sinks if they don't already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + const auto& dims = config_->dimensions; + const auto n_shards = dims->number_of_shards(); + + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const auto* shard_table = shard_tables_.data() + shard_idx; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + + const size_t table_size = shard_table->size() * sizeof(uint64_t); + std::vector table(table_size + sizeof(uint32_t), 0); + + memcpy(table.data(), shard_table->data(), table_size); + + // compute crc32 checksum of the table + const uint32_t checksum = crc32c::Crc32c(table.data(), table_size); + memcpy(table.data() + table_size, &checksum, sizeof(uint32_t)); + + std::string data_path = data_paths_[shard_idx]; + + if (!write_binary_(data_path, table, *file_offset)) { + LOG_ERROR("Failed to write table and checksum to shard ", + shard_idx, + " at path ", + data_path); + return false; + } + } + + // don't reset state if we're closing + if (!is_closing_) { + for (auto& table : shard_tables_) { + std::ranges::fill(table, std::numeric_limits::max()); + } + std::ranges::fill(shard_file_offsets_, 0); + current_layer_ = 0; + } + + return true; +} + +void +zarr::FSArray::close_io_streams_() +{ + for (const auto& path : data_paths_) { + file_handle_pool_->close_handle(path); + } + + data_paths_.clear(); +} diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh new file mode 100644 index 00000000..5b289d10 --- /dev/null +++ b/src/streaming/fs.array.hh @@ -0,0 +1,23 @@ +#pragma once + +#include "array.hh" +#include "fs.storage.hh" + +namespace zarr { +class FSArray final + : public Array + , public FSStorage +{ + public: + FSArray(std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr file_handle_pool); + + protected: + bool write_metadata_() override; + + bool flush_data_() override; + bool flush_tables_() override; + void close_io_streams_() override; +}; +} // namespace zarr \ 
No newline at end of file diff --git a/src/streaming/fs.multiscale.array.cpp b/src/streaming/fs.multiscale.array.cpp new file mode 100644 index 00000000..814f0302 --- /dev/null +++ b/src/streaming/fs.multiscale.array.cpp @@ -0,0 +1,55 @@ +#include "fs.multiscale.array.hh" +#include "macros.hh" + +zarr::FSMultiscaleArray::FSMultiscaleArray( + std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr file_handle_pool) + : MultiscaleArray(config, thread_pool) + , FSStorage(file_handle_pool) +{ + // dimensions may be null in the case of intermediate groups, e.g., the + // A in A/1 + if (config_->dimensions) { + CHECK(FSMultiscaleArray::create_arrays_()); + } +} + +bool +zarr::FSMultiscaleArray::write_metadata_() +{ + std::string metadata; + if (!make_metadata_(metadata)) { + LOG_ERROR("Failed to make metadata."); + return false; + } + const std::string path = node_path_() + "/zarr.json"; + + return write_string_(path, metadata, 0); +} + +bool +zarr::FSMultiscaleArray::create_arrays_() +{ + arrays_.clear(); + + try { + if (downsampler_) { + const auto& configs = downsampler_->writer_configurations(); + arrays_.resize(configs.size()); + + for (const auto& [lod, config] : configs) { + arrays_[lod] = std::make_unique( + config, thread_pool_, file_handle_pool_); + } + } else { + arrays_.push_back(std::make_unique( + make_base_array_config_(), thread_pool_, file_handle_pool_)); + } + } catch (const std::exception& e) { + LOG_ERROR(e.what()); + return false; + } + + return true; +} diff --git a/src/streaming/fs.multiscale.array.hh b/src/streaming/fs.multiscale.array.hh new file mode 100644 index 00000000..13b51bf6 --- /dev/null +++ b/src/streaming/fs.multiscale.array.hh @@ -0,0 +1,22 @@ +#pragma once + +#include "fs.array.hh" +#include "fs.storage.hh" +#include "multiscale.array.hh" + +namespace zarr { +class FSMultiscaleArray + : public MultiscaleArray + , public FSStorage +{ + public: + FSMultiscaleArray(std::shared_ptr config, + std::shared_ptr thread_pool, 
+ std::shared_ptr file_handle_pool); + + protected: + bool write_metadata_() override; + + bool create_arrays_() override; +}; +} // namespace zarr \ No newline at end of file diff --git a/src/streaming/fs.storage.cpp b/src/streaming/fs.storage.cpp new file mode 100644 index 00000000..556962b9 --- /dev/null +++ b/src/streaming/fs.storage.cpp @@ -0,0 +1,64 @@ +#include "fs.storage.hh" +#include "macros.hh" + +#include + +void* +make_flags(); + +void +destroy_flags(void* flags); + +bool +seek_and_write(void* handle, size_t offset, ConstByteSpan data); + +zarr::FSStorage::FSStorage(std::shared_ptr file_handle_pool) + : file_handle_pool_(file_handle_pool) +{ +} + +bool +zarr::FSStorage::write_binary_(const std::string& path, + const std::vector& data, + size_t offset) const +{ + void* flags = make_flags(); + auto handle = file_handle_pool_->get_handle(path, flags); + destroy_flags(flags); + + if (handle == nullptr) { + LOG_ERROR("Failed to get file handle for ", path); + return false; + } + + if (!seek_and_write(handle.get(), offset, data)) { + LOG_ERROR("Failed to write binary data to ", path); + return false; + } + + return true; +} + +bool +zarr::FSStorage::write_string_(const std::string& path, + const std::string& data, + size_t offset) const +{ + void* flags = make_flags(); + auto handle = file_handle_pool_->get_handle(path, flags); + destroy_flags(flags); + + if (handle == nullptr) { + LOG_ERROR("Failed to get file handle for ", path); + return false; + } + + std::span span{ reinterpret_cast(data.data()), + data.size() }; + if (!seek_and_write(handle.get(), offset, span)) { + LOG_ERROR("Failed to write string to ", path); + return false; + } + + return true; +} diff --git a/src/streaming/fs.storage.hh b/src/streaming/fs.storage.hh new file mode 100644 index 00000000..74c29de6 --- /dev/null +++ b/src/streaming/fs.storage.hh @@ -0,0 +1,40 @@ +#pragma once + +#include "array.base.hh" +#include "file.handle.hh" + +#include + +namespace zarr { +class FSStorage +{ + 
public: + explicit FSStorage(std::shared_ptr file_handle_pool); + virtual ~FSStorage() = default; + + protected: + std::shared_ptr file_handle_pool_; + + /** + * @brief Write binary data to a path at the given offset. + * @param path The path to write to. + * @param data The data to write. + * @param offset The offset to write at. + * @return True if the write was successful, false otherwise. + */ + [[nodiscard]] bool write_binary_(const std::string& path, + const std::vector& data, + size_t offset) const; + + /** + * @brief Write a string to a path at the given offset. + * @param path The path to write to. + * @param data The string to write. + * @param offset The offset to write at. + * @return True if the write was successful, false otherwise. + */ + [[nodiscard]] bool write_string_(const std::string& path, + const std::string& data, + size_t offset) const; +}; +} // namespace zarr \ No newline at end of file diff --git a/src/streaming/multiscale.array.cpp b/src/streaming/multiscale.array.cpp index bf9f8a77..977d08d6 100644 --- a/src/streaming/multiscale.array.cpp +++ b/src/streaming/multiscale.array.cpp @@ -21,24 +21,15 @@ dimension_type_to_string(ZarrDimensionType type) } } // namespace -zarr::MultiscaleArray::MultiscaleArray( - std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool) - : ArrayBase(config, thread_pool, file_handle_pool, s3_connection_pool) +zarr::MultiscaleArray::MultiscaleArray(std::shared_ptr config, + std::shared_ptr thread_pool) + : ArrayBase(config, thread_pool) { bytes_per_frame_ = config_->dimensions == nullptr ? 
0 : bytes_of_frame(*config_->dimensions, config_->dtype); EXPECT(create_downsampler_(), "Failed to create downsampler"); - - // dimensions may be null in the case of intermediate groups, e.g., the - // A in A/1 - if (config_->dimensions) { - CHECK(create_arrays_()); - } } size_t @@ -76,17 +67,9 @@ zarr::MultiscaleArray::write_frame(LockedBuffer& data) return n_bytes; } -std::vector -zarr::MultiscaleArray::metadata_keys_() const -{ - return { "zarr.json" }; -} - bool -zarr::MultiscaleArray::make_metadata_() +zarr::MultiscaleArray::make_metadata_(std::string& metadata_str) { - metadata_sinks_.clear(); - nlohmann::json metadata = { { "zarr_format", 3 }, { "consolidated_metadata", nullptr }, @@ -98,7 +81,7 @@ zarr::MultiscaleArray::make_metadata_() metadata["attributes"]["ome"] = get_ome_metadata_(); } - metadata_strings_.emplace("zarr.json", metadata.dump(4)); + metadata_str = metadata.dump(4); return true; } @@ -118,36 +101,7 @@ zarr::MultiscaleArray::close_() return false; } - for (auto& [key, sink] : metadata_sinks_) { - EXPECT(zarr::finalize_sink(std::move(sink)), - "Failed to finalize metadata sink ", - key); - } - arrays_.clear(); - metadata_sinks_.clear(); - - return true; -} - -bool -zarr::MultiscaleArray::create_arrays_() -{ - arrays_.clear(); - - if (downsampler_) { - const auto& configs = downsampler_->writer_configurations(); - arrays_.resize(configs.size()); - - for (const auto& [lod, config] : configs) { - arrays_[lod] = std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_); - } - } else { - const auto config = make_base_array_config_(); - arrays_.push_back(std::make_unique( - config, thread_pool_, file_handle_pool_, s3_connection_pool_)); - } return true; } diff --git a/src/streaming/multiscale.array.hh b/src/streaming/multiscale.array.hh index 96a6d4b1..a019a90a 100644 --- a/src/streaming/multiscale.array.hh +++ b/src/streaming/multiscale.array.hh @@ -2,7 +2,6 @@ #include "array.hh" #include "downsampler.hh" -#include 
"sink.hh" #include "thread.pool.hh" #include @@ -14,9 +13,7 @@ class MultiscaleArray : public ArrayBase { public: MultiscaleArray(std::shared_ptr config, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::shared_ptr s3_connection_pool); + std::shared_ptr thread_pool); size_t memory_usage() const noexcept override; @@ -36,12 +33,11 @@ class MultiscaleArray : public ArrayBase size_t bytes_per_frame_; - std::vector metadata_keys_() const override; - bool make_metadata_() override; + bool make_metadata_(std::string& metadata_str) override; bool close_() override; /** @brief Create array writers. */ - [[nodiscard]] bool create_arrays_(); + [[nodiscard]] virtual bool create_arrays_() = 0; /** * @brief Construct OME metadata for this group. @@ -60,7 +56,7 @@ class MultiscaleArray : public ArrayBase [[nodiscard]] virtual nlohmann::json make_multiscales_metadata_() const; /** @brief Create a configuration for a full-resolution Array. */ - std::shared_ptr make_base_array_config_() const; + std::shared_ptr make_base_array_config_() const; /** * @brief Add @p data to downsampler and write downsampled frames to lower- diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp new file mode 100644 index 00000000..b33e74de --- /dev/null +++ b/src/streaming/s3.array.cpp @@ -0,0 +1,154 @@ +#include "macros.hh" +#include "s3.array.hh" + +#include + +#include + +zarr::S3Array::S3Array(std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr s3_connection_pool) + : Array(config, thread_pool) + , S3Storage(*config->bucket_name, s3_connection_pool) +{ + CHECK(config_->dimensions); +} + +bool +zarr::S3Array::write_metadata_() +{ + std::string metadata; + if (!make_metadata_(metadata)) { + LOG_ERROR("Failed to make metadata."); + return false; + } + const std::string path = node_path_() + "/zarr.json"; + + return write_string_(path, metadata, 0); +} + +bool +zarr::S3Array::flush_data_() +{ + // construct paths to shard sinks if they don't 
already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + const auto& dims = config_->dimensions; + + const auto n_shards = dims->number_of_shards(); + CHECK(data_paths_.size() == n_shards); + + std::atomic all_successful = 1; + + std::vector> futures; + + // wait for the chunks in each shard to finish compressing, then defragment + // and write the shard + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const std::string data_path = data_paths_[shard_idx]; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + auto job = + [shard_idx, data_path, file_offset, promise, &all_successful, this]( + std::string& err) { + bool success = true; + + try { + // consolidate chunks in shard + const auto shard_data = consolidate_chunks_(shard_idx); + if (!write_binary_(data_path, shard_data, *file_offset)) { + err = "Failed to write shard at path " + data_path; + success = false; + } else { + *file_offset = shard_data.size(); + } + } catch (const std::exception& exc) { + err = "Failed to flush data: " + std::string(exc.what()); + success = false; + } + + all_successful.fetch_and(success); + promise->set_value(); + + return success; + }; + + // one thread is reserved for processing the frame queue and runs the + // entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + std::string err; + if (!job(err)) { + LOG_ERROR(err); + } + } + } + + // wait for all threads to finish + for (auto& future : futures) { + future.wait(); + } + + return static_cast(all_successful); +} + +bool +zarr::S3Array::flush_tables_() +{ + // construct paths to shard sinks if they don't already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + const auto& dims = config_->dimensions; + const auto n_shards = dims->number_of_shards(); + + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const auto* 
shard_table = shard_tables_.data() + shard_idx; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + + const size_t table_size = shard_table->size() * sizeof(uint64_t); + std::vector table(table_size + sizeof(uint32_t), 0); + + memcpy(table.data(), shard_table->data(), table_size); + + // compute crc32 checksum of the table + const uint32_t checksum = crc32c::Crc32c(table.data(), table_size); + memcpy(table.data() + table_size, &checksum, sizeof(uint32_t)); + + std::string data_path = data_paths_[shard_idx]; + + if (!write_binary_(data_path, table, *file_offset)) { + LOG_ERROR("Failed to write table and checksum to shard ", + shard_idx, + " at path ", + data_path); + return false; + } + } + + // don't reset state if we're closing + if (!is_closing_) { + for (auto& table : shard_tables_) { + std::ranges::fill(table, std::numeric_limits::max()); + } + std::ranges::fill(shard_file_offsets_, 0); + current_layer_ = 0; + } + + return true; +} + +void +zarr::S3Array::close_io_streams_() +{ + for (const auto& key : data_paths_) { + s3_objects_.erase(key); + } + + data_paths_.clear(); +} diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh new file mode 100644 index 00000000..3a3b9bb9 --- /dev/null +++ b/src/streaming/s3.array.hh @@ -0,0 +1,23 @@ +#pragma once + +#include "array.hh" +#include "s3.storage.hh" + +namespace zarr { +class S3Array final + : public Array + , public S3Storage +{ + public: + S3Array(std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr s3_connection_pool); + + protected: + bool write_metadata_() override; + + bool flush_data_() override; + bool flush_tables_() override; + void close_io_streams_() override; +}; +} // namespace zarr \ No newline at end of file diff --git a/src/streaming/s3.multiscale.array.cpp b/src/streaming/s3.multiscale.array.cpp new file mode 100644 index 00000000..5b862011 --- /dev/null +++ b/src/streaming/s3.multiscale.array.cpp @@ -0,0 +1,55 @@ +#include "macros.hh" +#include 
"s3.multiscale.array.hh" + +zarr::S3MultiscaleArray::S3MultiscaleArray( + std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr s3_connection_pool) + : MultiscaleArray(config, thread_pool) + , S3Storage(*config->bucket_name, s3_connection_pool) +{ + // dimensions may be null in the case of intermediate groups, e.g., the + // A in A/1 + if (config_->dimensions) { + CHECK(S3MultiscaleArray::create_arrays_()); + } +} + +bool +zarr::S3MultiscaleArray::write_metadata_() +{ + std::string metadata; + if (!make_metadata_(metadata)) { + LOG_ERROR("Failed to make metadata."); + return false; + } + const std::string path = node_path_() + "/zarr.json"; + + return write_string_(path, metadata, 0); +} + +bool +zarr::S3MultiscaleArray::create_arrays_() +{ + arrays_.clear(); + + try { + if (downsampler_) { + const auto& configs = downsampler_->writer_configurations(); + arrays_.resize(configs.size()); + + for (const auto& [lod, config] : configs) { + arrays_[lod] = std::make_unique( + config, thread_pool_, s3_connection_pool_); + } + } else { + arrays_.push_back(std::make_unique( + make_base_array_config_(), thread_pool_, s3_connection_pool_)); + } + } catch (const std::exception& e) { + LOG_ERROR(e.what()); + return false; + } + + return true; +} diff --git a/src/streaming/s3.multiscale.array.hh b/src/streaming/s3.multiscale.array.hh new file mode 100644 index 00000000..2f9f8b19 --- /dev/null +++ b/src/streaming/s3.multiscale.array.hh @@ -0,0 +1,22 @@ +#pragma once + +#include "multiscale.array.hh" +#include "s3.array.hh" +#include "s3.storage.hh" + +namespace zarr { +class S3MultiscaleArray + : public MultiscaleArray + , public S3Storage +{ + public: + S3MultiscaleArray(std::shared_ptr config, + std::shared_ptr thread_pool, + std::shared_ptr s3_connection_pool); + + protected: + bool write_metadata_() override; + + bool create_arrays_() override; +}; +} // namespace zarr \ No newline at end of file diff --git a/src/streaming/s3.sink.cpp 
b/src/streaming/s3.object.cpp similarity index 87% rename from src/streaming/s3.sink.cpp rename to src/streaming/s3.object.cpp index adf7a0f9..6ffd212c 100644 --- a/src/streaming/s3.sink.cpp +++ b/src/streaming/s3.object.cpp @@ -1,5 +1,5 @@ #include "macros.hh" -#include "s3.sink.hh" +#include "s3.object.hh" #include #include @@ -8,9 +8,9 @@ #undef min #endif -zarr::S3Sink::S3Sink(std::string_view bucket_name, - std::string_view object_key, - std::shared_ptr connection_pool) +zarr::S3Object::S3Object(std::string_view bucket_name, + std::string_view object_key, + std::shared_ptr connection_pool) : bucket_name_{ bucket_name } , object_key_{ object_key } , connection_pool_{ connection_pool } @@ -21,39 +21,14 @@ zarr::S3Sink::S3Sink(std::string_view bucket_name, } bool -zarr::S3Sink::flush_() +zarr::S3Object::write(ConstByteSpan data, size_t offset) { - if (is_multipart_upload_()) { - const auto& parts = multipart_upload_->parts; - if (nbytes_buffered_ > 0 && !flush_part_()) { - LOG_ERROR("Failed to upload part ", - parts.size() + 1, - " of object ", - object_key_); - return false; - } - if (!finalize_multipart_upload_()) { - LOG_ERROR("Failed to finalize multipart upload of object ", - object_key_); - return false; - } - } else if (nbytes_buffered_ > 0) { - if (!put_object_()) { - LOG_ERROR("Failed to upload object: ", object_key_); - return false; - } + if (is_closed_) { + LOG_ERROR("Cannot write to closed stream"); + return false; } - // cleanup - nbytes_buffered_ = 0; - - return true; -} - -bool -zarr::S3Sink::write(size_t offset, ConstByteSpan data) -{ - if (data.data() == nullptr || data.empty()) { + if (data.empty()) { return true; } @@ -90,7 +65,42 @@ zarr::S3Sink::write(size_t offset, ConstByteSpan data) } bool -zarr::S3Sink::put_object_() +zarr::S3Object::close() +{ + if (is_closed_) { + return true; + } + + if (is_multipart_upload_()) { + const auto& parts = multipart_upload_->parts; + if (nbytes_buffered_ > 0 && !flush_part_()) { + LOG_ERROR("Failed to 
upload part ", + parts.size() + 1, + " of object ", + object_key_); + return false; + } + if (!finalize_multipart_upload_()) { + LOG_ERROR("Failed to finalize multipart upload of object ", + object_key_); + return false; + } + } else if (nbytes_buffered_ > 0) { + if (!put_object_()) { + LOG_ERROR("Failed to upload object: ", object_key_); + return false; + } + } + + // cleanup + nbytes_buffered_ = 0; + is_closed_ = true; + + return true; +} + +bool +zarr::S3Object::put_object_() { if (nbytes_buffered_ == 0) { return false; @@ -121,13 +131,13 @@ zarr::S3Sink::put_object_() } bool -zarr::S3Sink::is_multipart_upload_() const +zarr::S3Object::is_multipart_upload_() const { return multipart_upload_.has_value(); } void -zarr::S3Sink::create_multipart_upload_() +zarr::S3Object::create_multipart_upload_() { multipart_upload_ = MultiPartUpload{}; @@ -139,7 +149,7 @@ zarr::S3Sink::create_multipart_upload_() } bool -zarr::S3Sink::flush_part_() +zarr::S3Object::flush_part_() { if (nbytes_buffered_ == 0) { return false; @@ -188,7 +198,7 @@ zarr::S3Sink::flush_part_() } bool -zarr::S3Sink::finalize_multipart_upload_() +zarr::S3Object::finalize_multipart_upload_() { auto connection = connection_pool_->get_connection(); diff --git a/src/streaming/s3.sink.hh b/src/streaming/s3.object.hh similarity index 60% rename from src/streaming/s3.sink.hh rename to src/streaming/s3.object.hh index 0edeb51e..e804aaed 100644 --- a/src/streaming/s3.sink.hh +++ b/src/streaming/s3.object.hh @@ -1,6 +1,6 @@ #pragma once -#include "sink.hh" +#include "definitions.hh" #include "s3.connection.hh" #include @@ -8,17 +8,26 @@ #include namespace zarr { -class S3Sink : public Sink +class S3Object { public: - S3Sink(std::string_view bucket_name, - std::string_view object_key, - std::shared_ptr connection_pool); + S3Object(std::string_view bucket_name, + std::string_view object_key, + std::shared_ptr connection_pool); - bool write(size_t offset, ConstByteSpan data) override; + /** @brief Write data to the 
object at the given offset. + * @param data The data to write. + * @param offset The offset to write at. + * @return True if the write was successful, false otherwise. + */ + [[nodiscard]] bool write(ConstByteSpan data, size_t offset); - protected: - bool flush_() override; + /** + * @brief Close the object, flushing any remaining data. + * @details The object must not be used after calling this function. + * @return True if the object was successfully closed, otherwise false. + */ + [[nodiscard]] bool close(); private: struct MultiPartUpload @@ -27,18 +36,20 @@ class S3Sink : public Sink std::vector parts; }; - static constexpr size_t max_part_size_ = 5 << 20; + static constexpr size_t max_part_size_ = 5ULL << 20; std::string bucket_name_; std::string object_key_; std::shared_ptr connection_pool_; - std::array part_buffer_; + std::array part_buffer_{}; size_t nbytes_buffered_{ 0 }; size_t nbytes_flushed_{ 0 }; std::optional multipart_upload_; + bool is_closed_{ false }; + /** * @brief Upload the object to S3. * @return True if the object was successfully uploaded, otherwise false. 
diff --git a/src/streaming/s3.storage.cpp b/src/streaming/s3.storage.cpp new file mode 100644 index 00000000..57b785d0 --- /dev/null +++ b/src/streaming/s3.storage.cpp @@ -0,0 +1,77 @@ +#include "macros.hh" +#include "s3.storage.hh" + +#include + +zarr::S3Storage::S3Storage(const std::string& bucket_name, + std::shared_ptr s3_connection_pool) + : bucket_name_(bucket_name) + , s3_connection_pool_(std::move(s3_connection_pool)) +{ + EXPECT(!bucket_name_.empty(), "S3 bucket name is empty"); + EXPECT(s3_connection_pool_, "S3 connection pool is null"); +} + +bool +zarr::S3Storage::finalize_object(const std::string& path) +{ + if (auto it = s3_objects_.find(path); it != s3_objects_.end()) { + if (const auto& s3_object = it->second; s3_object != nullptr) { + if (!s3_object->close()) { + LOG_ERROR("Failed to finalize S3 object at ", path); + return false; + } + } + s3_objects_.erase(it); + + return true; + } + + return false; +} + +void +zarr::S3Storage::create_s3_object_(const std::string& key) +{ + if (!s3_objects_.contains(key)) { + s3_objects_.emplace( + key, + std::make_unique(bucket_name_, key, s3_connection_pool_)); + } +} + +bool +zarr::S3Storage::write_binary_(const std::string& key, + const std::vector& data, + size_t offset) +{ + create_s3_object_(key); + + auto it = s3_objects_.find(key); + EXPECT(it != s3_objects_.end(), "S3 object at ", key, " not found"); + if (auto& s3_object = it->second; s3_object != nullptr) { + return s3_object->write(data, offset); + } + + LOG_ERROR("S3 object at ", key, " is null"); + return false; +} + +bool +zarr::S3Storage::write_string_(const std::string& key, + const std::string& data, + size_t offset) +{ + create_s3_object_(key); + + auto it = s3_objects_.find(key); + EXPECT(it != s3_objects_.end(), "S3 object at ", key, " not found"); + if (auto& s3_object = it->second; s3_object != nullptr) { + std::span span{ reinterpret_cast(data.data()), + data.size() }; + return s3_object->write(span, offset); + } + + LOG_ERROR("S3 object 
at ", key, " is null"); + return false; +} diff --git a/src/streaming/s3.storage.hh b/src/streaming/s3.storage.hh new file mode 100644 index 00000000..95e785fa --- /dev/null +++ b/src/streaming/s3.storage.hh @@ -0,0 +1,54 @@ +#pragma once + +#include "s3.object.hh" + +#include + +namespace zarr { +class S3Storage +{ + public: + S3Storage(const std::string& bucket_name, + std::shared_ptr s3_connection_pool); + virtual ~S3Storage() = default; + + /** + * @brief Finalize the object at the given path. + * @details This will ensure that any buffered data is flushed and the + * object is properly closed. + * @param path The path of the object to finalize. + * @return True if the object was successfully finalized, otherwise false. + */ + [[nodiscard]] bool finalize_object(const std::string& path); + + protected: + const std::string bucket_name_; + std::shared_ptr s3_connection_pool_; + + void create_s3_object_(const std::string& key); + + /** + * @brief Write binary data to a path at the given offset. + * @param key The path to write to. + * @param data The data to write. + * @param offset The offset to write at. + * @return True if the write was successful, false otherwise. + */ + [[nodiscard]] bool write_binary_(const std::string& key, + const std::vector& data, + size_t offset); + + /** + * @brief Write a string to a path at the given offset. + * @param key The path to write to. + * @param data The string to write. + * @param offset The offset to write at. + * @return True if the write was successful, false otherwise. 
+ */ + [[nodiscard]] bool write_string_(const std::string& key, + const std::string& data, + size_t offset); + + std::unordered_map> s3_objects_; +}; +} // namespace zarr \ No newline at end of file diff --git a/src/streaming/sink.cpp b/src/streaming/sink.cpp index 851e9acd..0d947cec 100644 --- a/src/streaming/sink.cpp +++ b/src/streaming/sink.cpp @@ -27,72 +27,6 @@ bucket_exists(std::string_view bucket_name, return bucket_exists; } -bool -make_file_sinks(std::vector& file_paths, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::vector>& sinks) -{ - if (file_paths.empty()) { - return true; - } - - const auto parents = zarr::get_parent_paths(file_paths); - if (!zarr::make_dirs(parents, thread_pool)) { - LOG_ERROR("Failed to make parent directories"); - return false; - } - - std::atomic all_successful = 1; - - const auto n_files = file_paths.size(); - sinks.resize(n_files); - std::fill(sinks.begin(), sinks.end(), nullptr); - std::vector> futures; - - for (auto i = 0; i < n_files; ++i) { - const auto filename = file_paths[i]; - std::unique_ptr* psink = sinks.data() + i; - - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = - [filename, file_handle_pool, psink, promise, &all_successful]( - std::string& err) -> bool { - bool success = false; - - try { - *psink = - std::make_unique(filename, file_handle_pool); - success = true; - } catch (const std::exception& exc) { - err = "Failed to create file '" + filename + "': " + exc.what(); - } - - promise->set_value(); - all_successful.fetch_and(success); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool->n_threads() == 1 || !thread_pool->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - } - - for (auto& future : futures) { - future.wait(); - } - - return (bool)all_successful; -} - bool make_s3_sinks(std::string_view 
bucket_name, const std::vector& object_keys, @@ -115,7 +49,7 @@ make_s3_sinks(std::string_view bucket_name, const auto n_objects = object_keys.size(); sinks.resize(n_objects); for (auto i = 0; i < n_objects; ++i) { - sinks[i] = std::make_unique( + sinks[i] = std::make_unique( bucket_name, object_keys[i], connection_pool); } @@ -144,120 +78,11 @@ zarr::construct_data_paths(std::string_view base_path, const ArrayDimensions& dimensions, const DimensionPartsFun& parts_along_dimension) { - std::queue paths_queue; - paths_queue.emplace(base_path); - - // create intermediate paths - for (auto i = 1; // skip the last dimension - i < dimensions.ndims() - 1; // skip the x dimension - ++i) { - const auto& dim = dimensions.at(i); - const auto n_parts = parts_along_dimension(dim); - CHECK(n_parts); - auto n_paths = paths_queue.size(); - for (auto j = 0; j < n_paths; ++j) { - const auto path = paths_queue.front(); - paths_queue.pop(); - - for (auto k = 0; k < n_parts; ++k) { - const auto kstr = std::to_string(k); - paths_queue.push(path + (path.empty() ? 
kstr : "/" + kstr)); - } - } - } - - // create final paths - std::vector paths_out; - paths_out.reserve(paths_queue.size() * - parts_along_dimension(dimensions.width_dim())); - { - const auto& dim = dimensions.width_dim(); - const auto n_parts = parts_along_dimension(dim); - CHECK(n_parts); - - auto n_paths = paths_queue.size(); - for (auto i = 0; i < n_paths; ++i) { - const auto path = paths_queue.front(); - paths_queue.pop(); - for (auto j = 0; j < n_parts; ++j) - paths_out.push_back(path + "/" + std::to_string(j)); - } - } return paths_out; } -std::vector -zarr::get_parent_paths(const std::vector& file_paths) -{ - std::unordered_set unique_paths; - for (const auto& file_path : file_paths) { - unique_paths.emplace(fs::path(file_path).parent_path().string()); - } - - return { unique_paths.begin(), unique_paths.end() }; -} - -bool -zarr::make_dirs(const std::vector& dir_paths, - std::shared_ptr thread_pool) -{ - if (dir_paths.empty()) { - return true; - } - EXPECT(thread_pool, "Thread pool not provided."); - - std::atomic all_successful = 1; - const std::unordered_set unique_paths(dir_paths.begin(), dir_paths.end()); - - std::vector> futures; - - for (const auto& path : unique_paths) { - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - auto job = [path, promise, &all_successful](std::string& err) { - bool success = true; - try { - if (fs::is_directory(path) || path.empty()) { - promise->set_value(); - return success; - } - - std::error_code ec; - if (!fs::create_directories(path, ec) && - !fs::is_directory(path)) { - err = "Failed to create directory '" + path + - "': " + ec.message(); - success = false; - } - } catch (const std::exception& exc) { - err = - "Failed to create directory '" + path + "': " + exc.what(); - success = false; - } - - promise->set_value(); - all_successful.fetch_and(success); - return success; - }; - - if (thread_pool->n_threads() == 1 || !thread_pool->push_job(job)) { - if (std::string err; !job(err)) 
{ - LOG_ERROR(err); - } - } - } - - // wait for all jobs to finish - for (auto& future : futures) { - future.wait(); - } - - return all_successful; -} - std::unique_ptr zarr::make_file_sink(std::string_view file_path, std::shared_ptr file_handle_pool) @@ -286,32 +111,6 @@ zarr::make_file_sink(std::string_view file_path, return std::make_unique(file_path, file_handle_pool); } -bool -zarr::make_data_file_sinks(std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::vector>& part_sinks) -{ - if (base_path.starts_with("file://")) { - base_path = base_path.substr(7); - } - - EXPECT(!base_path.empty(), "Base path must not be empty."); - - std::vector paths; - try { - paths = - construct_data_paths(base_path, dimensions, parts_along_dimension); - } catch (const std::exception& exc) { - LOG_ERROR("Failed to create dataset paths: ", exc.what()); - return false; - } - - return make_file_sinks(paths, thread_pool, file_handle_pool, part_sinks); -} - std::unique_ptr zarr::make_s3_sink(std::string_view bucket_name, std::string_view object_key, @@ -325,7 +124,7 @@ zarr::make_s3_sink(std::string_view bucket_name, return nullptr; } - return std::make_unique(bucket_name, object_key, connection_pool); + return std::make_unique(bucket_name, object_key, connection_pool); } bool diff --git a/src/streaming/sink.hh b/src/streaming/sink.hh deleted file mode 100644 index 3c514d19..00000000 --- a/src/streaming/sink.hh +++ /dev/null @@ -1,149 +0,0 @@ -#pragma once - -#include "definitions.hh" -#include "s3.connection.hh" -#include "thread.pool.hh" -#include "array.dimensions.hh" - -#include // size_t -#include -#include // std::unique_ptr - -namespace zarr { -class Sink -{ - public: - virtual ~Sink() = default; - - /** - * @brief Write data to the sink. - * @param offset The offset in the sink to write to. - * @param data The buffer to write to the sink. 
- * @param bytes_of_buf The number of bytes to write from @p buf. - * @return True if the write was successful, false otherwise. - */ - [[nodiscard]] virtual bool write(size_t offset, ConstByteSpan data) = 0; - - protected: - /** - * @brief Flush any buffered data to the sink. - * @note This should ONLY be called when finalizing the sink. - * @return True if the flush was successful, false otherwise. - */ - [[nodiscard]] virtual bool flush_() = 0; - - friend bool finalize_sink(std::unique_ptr&& sink); -}; - -/** - * @brief Finalize and destroy @p sink. - * @note @p sink is no longer accessible after a successful call to this - * function. - * @param[in] sink The Sink to finalize. - * @return True if and only if the Sink was finalized successfully. - */ -bool -finalize_sink(std::unique_ptr&& sink); - -/** - * @brief Construct paths for data sinks, given the dimensions and a function - * to determine the number of parts along a dimension. - * @param base_path The base path for the dataset. - * @param dimensions The dimensions of the dataset. - * @param parts_along_dimension Function to determine the number of parts along - * a dimension. - * @return A vector of paths for the data sinks. - */ -std::vector -construct_data_paths(std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension); - -/** - * @brief Get unique paths to the parent directories of each file in @p - * file_paths. - * @param file_paths Collection of paths to files. - * @return Collection of unique parent directories. - */ -std::vector -get_parent_paths(const std::vector& file_paths); - -/** - * @brief Parallel create directories for a collection of paths. - * @param dir_paths The directories to create. - * @param thread_pool The thread pool to use for parallel creation. - * @return True iff all directories were created successfully. 
- */ -bool -make_dirs(const std::vector& dir_paths, - std::shared_ptr thread_pool); - -/** - * @brief Create a file sink from a path. - * @param file_path The path to the file. - * @param file_handle_pool Pointer to a pool of file handles. - * @return Pointer to the sink created, or nullptr if the file cannot be - * opened. - * @throws std::runtime_error if the file path is not valid. - */ -std::unique_ptr -make_file_sink(std::string_view file_path, - std::shared_ptr file_handle_pool); - -/** - * @brief Create a collection of file sinks for a Zarr dataset. - * @param[in] base_path The path to the base directory for the dataset. - * @param[in] dimensions The dimensions of the dataset. - * @param[in] parts_along_dimension Function to determine the number of - * parts (i.e., shards or chunks) along a dimension. - * @param[in] thread_pool Pointer to a thread pool object. Used to create files - * in parallel. - * @param file_handle_pool Pointer to a pool of file handles. - * @param[out] part_sinks The sinks created. - * @return True iff all file sinks were created successfully. - * @throws std::runtime_error if @p base_path is not valid, or if the number - * of parts along a dimension is zero. - */ -[[nodiscard]] bool -make_data_file_sinks(std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension, - std::shared_ptr thread_pool, - std::shared_ptr file_handle_pool, - std::vector>& part_sinks); - -/** - * @brief Create a sink from an S3 bucket name and object key. - * @param bucket_name The name of the bucket in which the object is stored. - * @param object_key The key of the object to write to. - * @param connection_pool Pointer to a pool of existing S3 connections. - * @return Pointer to the sink created, or nullptr if the bucket does not - * exist. - * @throws std::runtime_error if the bucket name or object key is not valid, - * or if there is no connection pool. 
- */ -std::unique_ptr -make_s3_sink(std::string_view bucket_name, - std::string_view object_key, - std::shared_ptr connection_pool); - -/** - * @brief Create a collection of S3 sinks for a Zarr dataset. - * @param[in] bucket_name The name of the bucket in which the dataset is - * stored. - * @param[in] base_path The path to the base directory for the dataset. - * @param[in] dimensions The dimensions of the dataset. - * @param[in] parts_along_dimension Function to determine the number of - * parts (i.e., shards or chunks) along a dimension. - * @param[in] connection_pool Pointer to a pool of existing S3 connections. - * @param[out] part_sinks The sinks created. - * @return True iff all file sinks were created successfully. - */ -[[nodiscard]] bool -make_data_s3_sinks(std::string_view bucket_name, - std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension, - std::shared_ptr connection_pool, - std::vector>& part_sinks); -} // namespace zarr diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index 32a1c258..885e9b24 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -1,7 +1,10 @@ #include "acquire.zarr.h" #include "array.base.hh" +#include "fs.array.hh" +#include "fs.multiscale.array.hh" #include "macros.hh" -#include "sink.hh" +#include "s3.array.hh" +#include #include "zarr.common.hh" #include "zarr.stream.hh" @@ -938,46 +941,46 @@ ZarrStream_s::write_custom_metadata(std::string_view custom_metadata, return ZarrStatusCode_InvalidArgument; } - // check if we have already written custom metadata - if (!custom_metadata_sink_) { - const std::string metadata_key = "acquire.json"; - std::string base_path = store_path_; - if (base_path.starts_with("file://")) { - base_path = base_path.substr(7); - } - const auto prefix = base_path.empty() ? 
"" : base_path + "/"; - const auto sink_path = prefix + metadata_key; - - if (is_s3_acquisition_()) { - custom_metadata_sink_ = zarr::make_s3_sink( - s3_settings_->bucket_name, sink_path, s3_connection_pool_); - } else { - custom_metadata_sink_ = - zarr::make_file_sink(sink_path, file_handle_pool_); - } - } else if (!overwrite) { // custom metadata already written, don't overwrite - LOG_ERROR("Custom metadata already written, use overwrite flag"); - return ZarrStatusCode_WillNotOverwrite; - } - - if (!custom_metadata_sink_) { - LOG_ERROR("Custom metadata sink not found"); - return ZarrStatusCode_InternalError; - } - - const auto metadata_json = nlohmann::json::parse(custom_metadata, - nullptr, // callback - false, // allow exceptions - true // ignore comments - ); - - const auto metadata_str = metadata_json.dump(4); - std::span data{ reinterpret_cast(metadata_str.data()), - metadata_str.size() }; - if (!custom_metadata_sink_->write(0, data)) { - LOG_ERROR("Error writing custom metadata"); - return ZarrStatusCode_IOError; - } + // // check if we have already written custom metadata + // if (!custom_metadata_sink_) { + // const std::string metadata_key = "acquire.json"; + // std::string base_path = store_path_; + // if (base_path.starts_with("file://")) { + // base_path = base_path.substr(7); + // } + // const auto prefix = base_path.empty() ? 
"" : base_path + "/"; + // const auto sink_path = prefix + metadata_key; + // + // if (is_s3_acquisition_()) { + // custom_metadata_sink_ = zarr::make_s3_sink( + // s3_settings_->bucket_name, sink_path, s3_connection_pool_); + // } else { + // custom_metadata_sink_ = + // zarr::make_file_sink(sink_path, file_handle_pool_); + // } + // } else if (!overwrite) { // custom metadata already written, don't overwrite + // LOG_ERROR("Custom metadata already written, use overwrite flag"); + // return ZarrStatusCode_WillNotOverwrite; + // } + // + // if (!custom_metadata_sink_) { + // LOG_ERROR("Custom metadata sink not found"); + // return ZarrStatusCode_InternalError; + // } + // + // const auto metadata_json = nlohmann::json::parse(custom_metadata, + // nullptr, // callback + // false, // allow exceptions + // true // ignore comments + // ); + // + // const auto metadata_str = metadata_json.dump(4); + // std::span data{ reinterpret_cast(metadata_str.data()), + // metadata_str.size() }; + // if (!custom_metadata_sink_->write(0, data)) { + // LOG_ERROR("Error writing custom metadata"); + // return ZarrStatusCode_IOError; + // } return ZarrStatusCode_Success; } @@ -1168,8 +1171,23 @@ ZarrStream_s::configure_array_(const ZarrArraySettings* settings, ZarrOutputArray output_node{ .output_key = config->node_key, .frame_buffer_offset = 0 }; try { - output_node.array = zarr::make_array( - config, thread_pool_, file_handle_pool_, s3_connection_pool_); + const bool multiscale = + config->node_key.empty() || config->downsampling_method.has_value(); + const bool s3 = bucket_name.has_value(); + + if (multiscale && s3) { + output_node.array = zarr::make_array( + config, thread_pool_, s3_connection_pool_); + } else if (multiscale) { + output_node.array = zarr::make_array( + config, thread_pool_, file_handle_pool_); + } else if (s3) { + output_node.array = zarr::make_array( + config, thread_pool_, s3_connection_pool_); + } else { + output_node.array = zarr::make_array( + config, 
thread_pool_, file_handle_pool_); + } } catch (const std::exception& exc) { set_error_(exc.what()); } @@ -1414,75 +1432,75 @@ ZarrStream_s::create_store_(bool overwrite) bool ZarrStream_s::write_intermediate_metadata_() { - std::optional bucket_name; - if (s3_settings_) { - bucket_name = s3_settings_->bucket_name; - } - - const nlohmann::json group_metadata = nlohmann::json({ - { "zarr_format", 3 }, - { "consolidated_metadata", nullptr }, - { "node_type", "group" }, - { "attributes", nlohmann::json::object() }, - }); - const std::string metadata_key = "zarr.json"; - std::string metadata_str; - - for (const auto& parent_group_key : intermediate_group_paths_) { - const std::string relative_path = - (parent_group_key.empty() ? "" : parent_group_key); - - if (auto pit = plates_.find(relative_path); // is it a plate? - pit != plates_.end()) { - const auto& plate = pit->second; - nlohmann::json plate_metadata( - group_metadata); // make a copy to modify - - // not supported for Zarr V2 / NGFF 0.4 - plate_metadata["attributes"]["ome"] = { - { "version", "0.5" }, - { "plate", plate.to_json() }, - }; - - metadata_str = plate_metadata.dump(4); - } else if (auto wit = wells_.find(relative_path); // is it a well? 
- wit != wells_.end()) { - const auto& well = wit->second; - nlohmann::json well_metadata( - group_metadata); // make a copy to modify - - // not supported for Zarr V2 / NGFF 0.4 - well_metadata["attributes"]["ome"] = { - { "version", "0.5" }, - { "well", well.to_json() }, - }; - - metadata_str = well_metadata.dump(4); - } else { // generic group - metadata_str = group_metadata.dump(4); - } - - ConstByteSpan metadata_span( - reinterpret_cast(metadata_str.data()), - metadata_str.size()); - - const std::string sink_path = - store_path_ + "/" + relative_path + "/" + metadata_key; - std::unique_ptr metadata_sink; - if (is_s3_acquisition_()) { - metadata_sink = zarr::make_s3_sink( - bucket_name.value(), sink_path, s3_connection_pool_); - } else { - metadata_sink = zarr::make_file_sink(sink_path, file_handle_pool_); - } - - if (!metadata_sink->write(0, metadata_span) || - !zarr::finalize_sink(std::move(metadata_sink))) { - set_error_("Failed to write intermediate metadata for group '" + - parent_group_key + "'"); - return false; - } - } + // std::optional bucket_name; + // if (s3_settings_) { + // bucket_name = s3_settings_->bucket_name; + // } + // + // const nlohmann::json group_metadata = nlohmann::json({ + // { "zarr_format", 3 }, + // { "consolidated_metadata", nullptr }, + // { "node_type", "group" }, + // { "attributes", nlohmann::json::object() }, + // }); + // const std::string metadata_key = "zarr.json"; + // std::string metadata_str; + // + // for (const auto& parent_group_key : intermediate_group_paths_) { + // const std::string relative_path = + // (parent_group_key.empty() ? "" : parent_group_key); + // + // if (auto pit = plates_.find(relative_path); // is it a plate? 
+ // pit != plates_.end()) { + // const auto& plate = pit->second; + // nlohmann::json plate_metadata( + // group_metadata); // make a copy to modify + // + // // not supported for Zarr V2 / NGFF 0.4 + // plate_metadata["attributes"]["ome"] = { + // { "version", "0.5" }, + // { "plate", plate.to_json() }, + // }; + // + // metadata_str = plate_metadata.dump(4); + // } else if (auto wit = wells_.find(relative_path); // is it a well? + // wit != wells_.end()) { + // const auto& well = wit->second; + // nlohmann::json well_metadata( + // group_metadata); // make a copy to modify + // + // // not supported for Zarr V2 / NGFF 0.4 + // well_metadata["attributes"]["ome"] = { + // { "version", "0.5" }, + // { "well", well.to_json() }, + // }; + // + // metadata_str = well_metadata.dump(4); + // } else { // generic group + // metadata_str = group_metadata.dump(4); + // } + // + // ConstByteSpan metadata_span( + // reinterpret_cast(metadata_str.data()), + // metadata_str.size()); + // + // const std::string sink_path = + // store_path_ + "/" + relative_path + "/" + metadata_key; + // std::unique_ptr metadata_sink; + // if (is_s3_acquisition_()) { + // metadata_sink = zarr::make_s3_sink( + // bucket_name.value(), sink_path, s3_connection_pool_); + // } else { + // metadata_sink = zarr::make_file_sink(sink_path, file_handle_pool_); + // } + // + // if (!metadata_sink->write(0, metadata_span) || + // !zarr::finalize_sink(std::move(metadata_sink))) { + // set_error_("Failed to write intermediate metadata for group '" + + // parent_group_key + "'"); + // return false; + // } + // } return true; } @@ -1648,11 +1666,11 @@ finalize_stream(struct ZarrStream_s* stream) // thread stream->thread_pool_->await_stop(); - if (stream->custom_metadata_sink_ && - !zarr::finalize_sink(std::move(stream->custom_metadata_sink_))) { - LOG_ERROR( - "Error finalizing Zarr stream. 
Failed to write custom metadata"); - } + // if (stream->custom_metadata_sink_ && + // !zarr::finalize_sink(std::move(stream->custom_metadata_sink_))) { + // LOG_ERROR( + // "Error finalizing Zarr stream. Failed to write custom metadata"); + // } for (auto& [key, output] : stream->output_arrays_) { if (!zarr::finalize_array(std::move(output.array))) { diff --git a/src/streaming/zarr.stream.hh b/src/streaming/zarr.stream.hh index 40d2fd0d..86e6365d 100644 --- a/src/streaming/zarr.stream.hh +++ b/src/streaming/zarr.stream.hh @@ -1,26 +1,18 @@ #pragma once -#include "array.hh" +#include "array.base.hh" #include "array.dimensions.hh" -#include "definitions.hh" -#include "downsampler.hh" #include "file.handle.hh" #include "frame.queue.hh" #include "locked.buffer.hh" -#include "multiscale.array.hh" #include "plate.hh" #include "s3.connection.hh" -#include "sink.hh" #include "thread.pool.hh" -#include - #include -#include // size_t #include // unique_ptr #include #include -#include #include #include @@ -87,7 +79,7 @@ struct ZarrStream_s std::shared_ptr s3_connection_pool_; std::shared_ptr file_handle_pool_; - std::unique_ptr custom_metadata_sink_; + // std::unique_ptr custom_metadata_sink_; bool is_s3_acquisition_() const; diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt index 9f84209c..45853fcb 100644 --- a/tests/unit-tests/CMakeLists.txt +++ b/tests/unit-tests/CMakeLists.txt @@ -9,16 +9,12 @@ set(tests array-dimensions-shard-index-for-chunk array-dimensions-shard-internal-index thread-pool-push-to-job-queue - make-dirs - construct-data-paths s3-connection-bucket-exists s3-connection-object-exists-check-false-positives s3-connection-put-object s3-connection-upload-multipart-object - file-sink-write s3-sink-write s3-sink-write-multipart - make-data-sinks array-write-even array-write-ragged-append-dim array-write-ragged-internal-dim diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index 
f60cc442..2ee0137e 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -1,10 +1,11 @@ -#include "array.hh" +#include "fs.array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" #include #include +#include namespace fs = std::filesystem; @@ -142,11 +143,10 @@ main() level_of_detail); { - auto writer = std::make_unique( + auto writer = std::make_unique( config, thread_pool, - std::make_shared(), - nullptr); + std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); diff --git a/tests/unit-tests/array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp index f6d1d8f4..09007a64 100644 --- a/tests/unit-tests/array-write-ragged-append-dim.cpp +++ b/tests/unit-tests/array-write-ragged-append-dim.cpp @@ -1,10 +1,11 @@ -#include "array.hh" +#include "fs.array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" #include #include +#include namespace fs = std::filesystem; @@ -113,11 +114,8 @@ main() 4); { - auto writer = std::make_unique( - config, - thread_pool, - std::make_shared(), - nullptr); + auto writer = std::make_unique( + config, thread_pool, std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); diff --git a/tests/unit-tests/array-write-ragged-internal-dim.cpp b/tests/unit-tests/array-write-ragged-internal-dim.cpp index 9c9d52f7..805aaccf 100644 --- a/tests/unit-tests/array-write-ragged-internal-dim.cpp +++ b/tests/unit-tests/array-write-ragged-internal-dim.cpp @@ -1,10 +1,11 @@ -#include "array.hh" +#include "fs.array.hh" #include "unit.test.macros.hh" #include "zarr.common.hh" #include #include +#include namespace fs = std::filesystem; @@ -130,11 +131,10 @@ main() 5); { - auto writer = std::make_unique( + auto writer = std::make_unique( config, thread_pool, - std::make_shared(), - 
nullptr); + std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); diff --git a/tests/unit-tests/construct-data-paths.cpp b/tests/unit-tests/construct-data-paths.cpp deleted file mode 100644 index d340a557..00000000 --- a/tests/unit-tests/construct-data-paths.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "unit.test.macros.hh" -#include "sink.hh" -#include "array.dimensions.hh" - -#include -#include -#include - -namespace { -auto -create_parts_fun(size_t parts) -{ - return [parts](const ZarrDimension&) { return parts; }; -} -} // namespace - -int -main() -{ - int retval = 1; - - try { - std::vector dims{ - { "time", ZarrDimensionType_Time, 50, 16, 2 }, - { "height", ZarrDimensionType_Space, 100, 32, 2 }, - { "width", ZarrDimensionType_Space, 100, 32, 2 } - }; - ArrayDimensions dimensions(std::move(dims), ZarrDataType_uint8); - { - const auto parts_fun = create_parts_fun(2); - const auto paths = - zarr::construct_data_paths("", dimensions, parts_fun); - - EXPECT_EQ(int, paths.size(), 4); - EXPECT_STR_EQ(paths[0].c_str(), "0/0"); - EXPECT_STR_EQ(paths[1].c_str(), "0/1"); - EXPECT_STR_EQ(paths[2].c_str(), "1/0"); - EXPECT_STR_EQ(paths[3].c_str(), "1/1"); - } - - { - const auto parts_fun = create_parts_fun(3); - const auto paths = - zarr::construct_data_paths("", dimensions, parts_fun); - - EXPECT_EQ(int, paths.size(), 9); - EXPECT_STR_EQ(paths[0].c_str(), "0/0"); - EXPECT_STR_EQ(paths[1].c_str(), "0/1"); - EXPECT_STR_EQ(paths[2].c_str(), "0/2"); - EXPECT_STR_EQ(paths[3].c_str(), "1/0"); - EXPECT_STR_EQ(paths[4].c_str(), "1/1"); - EXPECT_STR_EQ(paths[5].c_str(), "1/2"); - EXPECT_STR_EQ(paths[6].c_str(), "2/0"); - EXPECT_STR_EQ(paths[7].c_str(), "2/1"); - EXPECT_STR_EQ(paths[8].c_str(), "2/2"); - } - - retval = 0; - } catch (const std::exception& e) { - LOG_ERROR("Test failed: ", e.what()); - throw; - } - - return retval; -} diff --git a/tests/unit-tests/file-sink-write.cpp 
b/tests/unit-tests/file-sink-write.cpp deleted file mode 100644 index a06d75ca..00000000 --- a/tests/unit-tests/file-sink-write.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "file.sink.hh" -#include "unit.test.macros.hh" - -#include -#include -#include - -namespace fs = std::filesystem; - -int -main() -{ - int retval = 0; - fs::path tmp_path = fs::temp_directory_path() / TEST; - - try { - CHECK(!fs::exists(tmp_path)); - { - char str[] = "Hello, Acquire!"; - auto sink = std::make_unique( - tmp_path.string(), std::make_shared()); - - std::span data = { reinterpret_cast(str), - sizeof(str) - 1 }; - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); - } - - // The file tmp_path should now contain the string "Hello, world!\n". - CHECK(fs::exists(tmp_path)); - - std::ifstream ifs(tmp_path); - CHECK(ifs.is_open()); - - std::string contents; - while (!ifs.eof()) { - std::getline(ifs, contents); - } - ifs.close(); - - EXPECT_STR_EQ(contents.c_str(), "Hello, Acquire!"); - } catch (const std::exception& e) { - LOG_ERROR("Caught exception: ", e.what()); - retval = 1; - } - - std::error_code ec; - if (!fs::remove(tmp_path, ec)) { - LOG_ERROR("Failed to remove file: ", ec.message()); - retval = 1; - } - - return retval; -} \ No newline at end of file diff --git a/tests/unit-tests/make-data-sinks.cpp b/tests/unit-tests/make-data-sinks.cpp deleted file mode 100644 index e674ff2c..00000000 --- a/tests/unit-tests/make-data-sinks.cpp +++ /dev/null @@ -1,292 +0,0 @@ -#include "sink.hh" -#include "s3.connection.hh" -#include "zarr.common.hh" -#include "acquire.zarr.h" -#include "unit.test.macros.hh" - -#include -#include - -namespace fs = std::filesystem; - -namespace { -const std::string test_dir = TEST "-data"; - -bool -get_settings(zarr::S3Settings& settings) -{ - char* env = nullptr; - if (!(env = std::getenv("ZARR_S3_ENDPOINT"))) { - LOG_ERROR("ZARR_S3_ENDPOINT not set."); - return false; - } - settings.endpoint = env; - - if (!(env = 
std::getenv("ZARR_S3_BUCKET_NAME"))) { - LOG_ERROR("ZARR_S3_BUCKET_NAME not set."); - return false; - } - settings.bucket_name = env; - - env = std::getenv("ZARR_S3_REGION"); - if (env) { - settings.region = env; - } - - return true; -} -} // namespace - -void -make_chunk_file_sinks(std::shared_ptr thread_pool, - const ArrayDimensions& dimensions) -{ - // create the sinks, then let them go out of scope to close the handles - { - std::vector> sinks; - CHECK( - zarr::make_data_file_sinks(test_dir, - dimensions, - zarr::chunks_along_dimension, - thread_pool, - std::make_shared(), - sinks)); - - std::vector data(2, 0); - for (auto& sink : sinks) { - CHECK(sink); - // we need to write some data to the sink to ensure it is created - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); - } - } - - const auto chunks_in_y = - zarr::chunks_along_dimension(dimensions.height_dim()); - const auto chunks_in_x = - zarr::chunks_along_dimension(dimensions.width_dim()); - - const fs::path base_path(test_dir); - for (auto i = 0; i < chunks_in_y; ++i) { - const fs::path y_dir = base_path / std::to_string(i); - - for (auto j = 0; j < chunks_in_x; ++j) { - const fs::path x_file = y_dir / std::to_string(j); - CHECK(fs::is_regular_file(x_file)); - - // cleanup - fs::remove(x_file); - } - CHECK(!fs::is_regular_file(y_dir / std::to_string(chunks_in_x))); - fs::remove(y_dir); - } - CHECK(!fs::is_directory(base_path / std::to_string(chunks_in_y))); -} - -void -make_chunk_s3_sinks(std::shared_ptr thread_pool, - std::shared_ptr connection_pool, - const std::string& bucket_name, - const ArrayDimensions& dimensions) -{ - // create the sinks, then let them go out of scope to close the handles - { - char data_[] = { 0, 0 }; - std::span data(reinterpret_cast(data_), sizeof(data_)); - std::vector> sinks; - CHECK(make_data_s3_sinks(bucket_name, - test_dir, - dimensions, - zarr::chunks_along_dimension, - connection_pool, - sinks)); - - for (auto& sink : sinks) { - CHECK(sink); - 
// we need to write some data to the sink to ensure it is created - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); - } - } - - const auto chunks_in_y = - zarr::chunks_along_dimension(dimensions.height_dim()); - const auto chunks_in_x = - zarr::chunks_along_dimension(dimensions.width_dim()); - - auto conn = connection_pool->get_connection(); - - const std::string base_path(test_dir); - for (auto i = 0; i < chunks_in_y; ++i) { - const std::string y_dir = base_path + "/" + std::to_string(i); - - for (auto j = 0; j < chunks_in_x; ++j) { - const std::string x_file = y_dir + "/" + std::to_string(j); - CHECK(conn->object_exists(bucket_name, x_file)); - - // cleanup - CHECK(conn->delete_object(bucket_name, x_file)); - } - CHECK(!conn->object_exists(bucket_name, - y_dir + "/" + std::to_string(chunks_in_x))); - CHECK(conn->delete_object(bucket_name, y_dir)); - } - CHECK(!conn->object_exists(bucket_name, - base_path + "/" + std::to_string(chunks_in_y))); - CHECK(conn->delete_object(bucket_name, base_path)); -} - -void -make_shard_file_sinks(std::shared_ptr thread_pool, - const ArrayDimensions& dimensions) -{ - // create the sinks, then let them go out of scope to close the handles - { - std::vector> sinks; - CHECK(make_data_file_sinks(test_dir, - dimensions, - zarr::shards_along_dimension, - thread_pool, - std::make_shared(), - sinks)); - - std::vector data(2, 0); - for (auto& sink : sinks) { - CHECK(sink); - // we need to write some data to the sink to ensure it is created - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); - } - } - - const auto shards_in_y = - zarr::shards_along_dimension(dimensions.height_dim()); - const auto shards_in_x = - zarr::shards_along_dimension(dimensions.width_dim()); - - const fs::path base_path(test_dir); - for (auto i = 0; i < shards_in_y; ++i) { - const fs::path y_dir = base_path / std::to_string(i); - - for (auto j = 0; j < shards_in_x; ++j) { - const fs::path x_file = y_dir / 
std::to_string(j); - CHECK(fs::is_regular_file(x_file)); - - // cleanup - fs::remove(x_file); - } - CHECK(!fs::is_regular_file(y_dir / std::to_string(shards_in_x))); - fs::remove(y_dir); - } - CHECK(!fs::is_directory(base_path / std::to_string(shards_in_y))); -} - -void -make_shard_s3_sinks(std::shared_ptr thread_pool, - std::shared_ptr connection_pool, - const std::string& bucket_name, - const ArrayDimensions& dimensions) -{ - // create the sinks, then let them go out of scope to close the handles - { - char data_[] = { 0, 0 }; - std::span data(reinterpret_cast(data_), sizeof(data_)); - std::vector> sinks; - CHECK(make_data_s3_sinks(bucket_name, - test_dir, - dimensions, - zarr::shards_along_dimension, - connection_pool, - sinks)); - - for (auto& sink : sinks) { - CHECK(sink); - // we need to write some data to the sink to ensure it is created - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); - } - } - - const auto shards_in_y = - zarr::shards_along_dimension(dimensions.height_dim()); - const auto shards_in_x = - zarr::shards_along_dimension(dimensions.width_dim()); - - auto conn = connection_pool->get_connection(); - - const std::string base_path(test_dir); - for (auto i = 0; i < shards_in_y; ++i) { - const std::string y_dir = base_path + "/" + std::to_string(i); - - for (auto j = 0; j < shards_in_x; ++j) { - const std::string x_file = y_dir + "/" + std::to_string(j); - CHECK(conn->object_exists(bucket_name, x_file)); - - // cleanup - CHECK(conn->delete_object(bucket_name, x_file)); - } - CHECK(!conn->object_exists(bucket_name, - y_dir + "/" + std::to_string(shards_in_x))); - CHECK(conn->delete_object(bucket_name, y_dir)); - } - CHECK(!conn->object_exists(bucket_name, - base_path + "/" + std::to_string(shards_in_y))); - CHECK(conn->delete_object(bucket_name, base_path)); -} - -int -main() -{ - Logger::set_log_level(LogLevel_Debug); - - std::vector dims; - dims.emplace_back("z", - ZarrDimensionType_Space, - 0, - 3, // 3 planes per chunk 
- 1); // 1 chunk per shard (3 planes per shard) - dims.emplace_back("y", - ZarrDimensionType_Space, - 4, - 2, // 2 rows per chunk, 2 chunks - 2); // 2 chunks per shard (4 rows per shard, 1 shard) - dims.emplace_back("x", - ZarrDimensionType_Space, - 12, - 3, // 3 columns per chunk, 4 chunks - 2); // 2 chunks per shard (6 columns per shard, 2 shards) - ArrayDimensions dimensions(std::move(dims), ZarrDataType_int8); - - auto thread_pool = std::make_shared( - std::thread::hardware_concurrency(), - [](const std::string& err) { LOG_ERROR("Failed: ", err.c_str()); }); - - try { - make_chunk_file_sinks(thread_pool, dimensions); - make_shard_file_sinks(thread_pool, dimensions); - } catch (const std::exception& e) { - LOG_ERROR("Failed: ", e.what()); - return 1; - } - - zarr::S3Settings settings; - if (!get_settings(settings)) { - LOG_WARNING("Failed to get credentials. Skipping S3 portion of test."); - return 0; - } - - auto connection_pool = - std::make_shared(4, settings); - - try { - make_chunk_s3_sinks( - thread_pool, connection_pool, settings.bucket_name, dimensions); - make_shard_s3_sinks( - thread_pool, connection_pool, settings.bucket_name, dimensions); - } catch (const std::exception& e) { - LOG_ERROR("Failed: ", e.what()); - return 1; - } - - return 0; -} \ No newline at end of file diff --git a/tests/unit-tests/s3-sink-write-multipart.cpp b/tests/unit-tests/s3-sink-write-multipart.cpp index d5da394f..7c0e254b 100644 --- a/tests/unit-tests/s3-sink-write-multipart.cpp +++ b/tests/unit-tests/s3-sink-write-multipart.cpp @@ -1,4 +1,4 @@ -#include "s3.sink.hh" +#include "s3.object.hh" #include "unit.test.macros.hh" #include @@ -57,9 +57,9 @@ main() std::vector data((5 << 20) + 1, 0); { auto sink = - std::make_unique(settings.bucket_name, object_name, pool); - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); + std::make_unique(settings.bucket_name, object_name, pool); + CHECK(sink->write(data, 0)); + CHECK(sink->close()); } conn = 
pool->get_connection(); diff --git a/tests/unit-tests/s3-sink-write.cpp b/tests/unit-tests/s3-sink-write.cpp index acac0cdd..9a7f3b89 100644 --- a/tests/unit-tests/s3-sink-write.cpp +++ b/tests/unit-tests/s3-sink-write.cpp @@ -1,4 +1,4 @@ -#include "s3.sink.hh" +#include "s3.object.hh" #include "unit.test.macros.hh" #include @@ -55,12 +55,12 @@ main() { char str[] = "Hello, Acquire!"; - auto sink = std::make_unique( + auto sink = std::make_unique( settings.bucket_name, object_name, pool); std::span data{ reinterpret_cast(str), sizeof(str) - 1 }; - CHECK(sink->write(0, data)); - CHECK(zarr::finalize_sink(std::move(sink))); + CHECK(sink->write(data, 0)); + CHECK(sink->close()); } conn = pool->get_connection(); From 064b4509d89d87b4392661eef66ea3c8f30158e5 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 13:32:25 +0200 Subject: [PATCH 16/38] (wip): tests ok (slow) --- .github/workflows/test.yml | 7 +- python/acquire-zarr-py.cpp | 2 +- python/tests/test_stream.py | 22 +- src/streaming/array.base.hh | 2 +- src/streaming/array.cpp | 21 +- src/streaming/array.dimensions.cpp | 9 +- src/streaming/file.handle.cpp | 16 +- src/streaming/file.handle.hh | 1 + src/streaming/fs.array.cpp | 20 +- src/streaming/fs.multiscale.array.cpp | 11 +- src/streaming/fs.storage.cpp | 8 +- src/streaming/fs.storage.hh | 18 +- src/streaming/s3.array.cpp | 22 +- src/streaming/s3.multiscale.array.cpp | 12 +- src/streaming/s3.storage.cpp | 6 +- src/streaming/s3.storage.hh | 42 +-- src/streaming/sink.cpp | 145 ---------- src/streaming/zarr.common.cpp | 3 +- src/streaming/zarr.stream.cpp | 249 ++++++++---------- src/streaming/zarr.stream.hh | 15 +- ...array-dimensions-chunk-internal-offset.cpp | 10 +- .../array-dimensions-chunk-lattice-index.cpp | 10 +- .../array-dimensions-tile-group-offset.cpp | 10 +- tests/unit-tests/array-write-even.cpp | 49 ++-- 24 files changed, 307 insertions(+), 403 deletions(-) delete mode 100644 src/streaming/sink.cpp diff --git a/.github/workflows/test.yml 
b/.github/workflows/test.yml index a70f39ce..63887d86 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -165,7 +165,12 @@ jobs: run: python -m pip install ".[testing]" - name: Test Python - run: python -m pytest -v -k test_stream_data_to_s3 + run: | + echo "AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID" >>.env + echo "AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" >>.env + echo "ZARR_S3_ENDPOINT=$ZARR_S3_ENDPOINT" >>.env + echo "ZARR_S3_BUCKET_NAME=$ZARR_S3_BUCKET_NAME" >>.env + python -m pytest -s -k test_stream_data_to_s3 test-python: diff --git a/python/acquire-zarr-py.cpp b/python/acquire-zarr-py.cpp index a1113a31..73ea1d5b 100644 --- a/python/acquire-zarr-py.cpp +++ b/python/acquire-zarr-py.cpp @@ -888,7 +888,7 @@ class PyZarrStreamSettings settings_.overwrite = static_cast(overwrite_); if (s3_settings_) { - *(settings_.s3_settings) = *(s3_settings_->settings()); + settings_.s3_settings = s3_settings_->settings(); } // construct array lifetime props and set up arrays diff --git a/python/tests/test_stream.py b/python/tests/test_stream.py index c8d7e108..de3bc04c 100644 --- a/python/tests/test_stream.py +++ b/python/tests/test_stream.py @@ -402,13 +402,16 @@ def test_stream_data_to_filesystem( assert np.array_equal(array[i, :, :], data[i, :, :]) metadata = array.metadata + sharding_codec = metadata.codecs[0] if compression_codec is not None: cname = ( zblosc.BloscCname.lz4 if compression_codec == CompressionCodec.BLOSC_LZ4 else zblosc.BloscCname.zstd ) - blosc_codec = metadata.codecs[0].codecs[1] + + assert len(sharding_codec.codecs) == 2 + blosc_codec = sharding_codec.codecs[1] assert blosc_codec.cname == cname assert blosc_codec.clevel == 1 assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle @@ -420,7 +423,7 @@ def test_stream_data_to_filesystem( store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" ).stat().st_size <= shard_size_bytes else: - assert len(metadata.codecs[0].codecs) == 1 + assert len(sharding_codec.codecs) == 1 assert ( 
store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" @@ -456,12 +459,12 @@ def test_stream_data_to_s3( pytest.skip("S3 settings not set") settings.store_path = f"{request.node.name}.zarr".replace("[", "").replace( - "]", "" + "]", "_" ) settings.s3 = s3_settings - settings.data_type = np.uint16 + settings.arrays[0].data_type = np.uint16 if compression_codec is not None: - settings.compression = CompressionSettings( + settings.arrays[0].compression = CompressionSettings( compressor=Compressor.BLOSC1, codec=compression_codec, level=1, @@ -501,18 +504,23 @@ def test_stream_data_to_s3( assert np.array_equal(array[i, :, :], data[i, :, :]) metadata = array.metadata + assert len(metadata.codecs) == 1 # sharding codec + sharding_codec = metadata.codecs[0] + if compression_codec is not None: cname = ( zblosc.BloscCname.lz4 if compression_codec == CompressionCodec.BLOSC_LZ4 else zblosc.BloscCname.zstd ) - blosc_codec = metadata.codecs[0].codecs[1] + assert len(sharding_codec.codecs) == 2 + + blosc_codec = sharding_codec.codecs[1] assert blosc_codec.cname == cname assert blosc_codec.clevel == 1 assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle else: - assert len(metadata.codecs[0].codecs) == 1 + assert len(sharding_codec.codecs) == 1 # bytes codec # cleanup s3 = s3fs.S3FileSystem( diff --git a/src/streaming/array.base.hh b/src/streaming/array.base.hh index 14d35e34..73bd2473 100644 --- a/src/streaming/array.base.hh +++ b/src/streaming/array.base.hh @@ -79,7 +79,7 @@ class ArrayBase std::shared_ptr config_; std::shared_ptr thread_pool_; - std::string metadata_str_; + std::string last_written_metadata_; std::string node_path_() const; [[nodiscard]] virtual bool make_metadata_(std::string& metadata_str) = 0; diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index baea640a..96d987a2 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -248,7 +248,7 @@ zarr::Array::make_metadata_(std::string& metadata_str) metadata["codecs"] = codecs; - 
metadata_str_ = metadata.dump(4); + metadata_str = metadata.dump(4); return true; } @@ -288,7 +288,7 @@ zarr::Array::make_data_paths_() paths_queue.emplace(data_root_); // create intermediate paths - for (auto i = 1; // skip the last dimension + for (auto i = 1; // skip the last dimension i < dimensions->ndims() - 1; // skip the x dimension ++i) { const auto& dim = dimensions->at(i); @@ -305,11 +305,11 @@ zarr::Array::make_data_paths_() paths_queue.push(path + (path.empty() ? kstr : "/" + kstr)); } } - } + } // create final paths data_paths_.reserve(paths_queue.size() * - shards_along_dimension(dimensions->width_dim())); + shards_along_dimension(dimensions->width_dim())); { const auto& dim = dimensions->width_dim(); const auto n_parts = shards_along_dimension(dim); @@ -520,12 +520,15 @@ zarr::Array::compress_and_flush_data_() return false; } - if (const auto should_write_table = is_closing_ || should_rollover_(); - should_write_table && !flush_tables_()) { - LOG_ERROR("Failed to flush shard tables"); - return false; - } else if (!should_write_table) { + if (is_closing_ || should_rollover_()) { // flush table + if (!flush_tables_()) { + LOG_ERROR("Failed to flush shard tables"); + return false; + } + current_layer_ = 0; + } else { ++current_layer_; + CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); } return true; diff --git a/src/streaming/array.dimensions.cpp b/src/streaming/array.dimensions.cpp index c7770d30..6f9c9806 100644 --- a/src/streaming/array.dimensions.cpp +++ b/src/streaming/array.dimensions.cpp @@ -11,12 +11,13 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, , bytes_per_chunk_(zarr::bytes_of_type(dtype)) , number_of_chunks_in_memory_(1) { - EXPECT(dims_.size() > 2, "Array must have at least three dimensions."); + const auto ndims = dims_.size(); + EXPECT(ndims > 2, "Array must have at least three dimensions."); frames_before_flush_ = final_dim().chunk_size_px * final_dim().shard_size_chunks; - for (auto i = 0; i < 
dims_.size(); ++i) { + for (auto i = 0; i < ndims; ++i) { const auto& dim = dims_[i]; bytes_per_chunk_ *= dim.chunk_size_px; chunks_per_shard_ *= dim.shard_size_chunks; @@ -24,7 +25,9 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, if (i > 0) { number_of_chunks_in_memory_ *= zarr::chunks_along_dimension(dim); number_of_shards_ *= zarr::shards_along_dimension(dim); - frames_before_flush_ *= dim.array_size_px; + if (i < ndims - 2) { + frames_before_flush_ *= dim.array_size_px; + } } } diff --git a/src/streaming/file.handle.cpp b/src/streaming/file.handle.cpp index 75551627..8ddcbd08 100644 --- a/src/streaming/file.handle.cpp +++ b/src/streaming/file.handle.cpp @@ -51,7 +51,15 @@ zarr::FileHandlePool::get_handle(const std::string& filename, void* flags) { std::unique_lock lock(mutex_); if (const auto it = handle_map_.find(filename); it != handle_map_.end()) { - return it->second->second.lock(); + if (auto handle = it->second->second.lock()) { + // move to front of list + handles_.splice(handles_.begin(), handles_, it->second); + return handle; + } + + // expired, remove from list and map + handles_.erase(it->second); + handle_map_.erase(it); } cv_.wait(lock, [&] { return handles_.size() < max_active_handles_; }); @@ -63,7 +71,7 @@ zarr::FileHandlePool::get_handle(const std::string& filename, void* flags) EXPECT(handle != nullptr, "Failed to create file handle for " + filename); handles_.emplace_front(filename, handle); - handle_map_[filename] = handles_.begin(); + handle_map_.emplace(filename, handles_.begin()); return handle; } @@ -83,10 +91,10 @@ bool zarr::FileHandlePool::evict_idle_handle_() { bool evicted = false; - for (auto it = handles_.begin(); it != handles_.end(); ++it) { + for (auto it = handles_.begin(); it != handles_.end();) { if (it->second.expired()) { handle_map_.erase(it->first); - handles_.erase(it); + it = handles_.erase(it); evicted = true; } } diff --git a/src/streaming/file.handle.hh b/src/streaming/file.handle.hh index 
e3281008..acbf811e 100644 --- a/src/streaming/file.handle.hh +++ b/src/streaming/file.handle.hh @@ -5,6 +5,7 @@ #include // for std::unique_ptr #include #include +#include namespace zarr { /** diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index 48d7b59c..d49147de 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -97,9 +97,18 @@ zarr::FSArray::write_metadata_() LOG_ERROR("Failed to make metadata."); return false; } + + if (last_written_metadata_ == metadata) { + return true; // no changes + } const std::string path = node_path_() + "/zarr.json"; - return write_string_(path, metadata, 0); + bool success; + if ((success = write_string(path, metadata, 0))) { + last_written_metadata_ = metadata; + } + + return success; } bool @@ -139,12 +148,12 @@ zarr::FSArray::flush_data_() try { // consolidate chunks in shard - const auto shard_data = consolidate_chunks_(shard_idx); - if (!write_binary_(data_path, shard_data, *file_offset)) { + if (const auto shard_data = consolidate_chunks_(shard_idx); + !write_binary(data_path, shard_data, *file_offset)) { err = "Failed to write shard at path " + data_path; success = false; } else { - *file_offset = shard_data.size(); + *file_offset += shard_data.size(); } } catch (const std::exception& exc) { err = "Failed to flush data: " + std::string(exc.what()); @@ -201,7 +210,7 @@ zarr::FSArray::flush_tables_() std::string data_path = data_paths_[shard_idx]; - if (!write_binary_(data_path, table, *file_offset)) { + if (!write_binary(data_path, table, *file_offset)) { LOG_ERROR("Failed to write table and checksum to shard ", shard_idx, " at path ", @@ -216,7 +225,6 @@ zarr::FSArray::flush_tables_() std::ranges::fill(table, std::numeric_limits::max()); } std::ranges::fill(shard_file_offsets_, 0); - current_layer_ = 0; } return true; diff --git a/src/streaming/fs.multiscale.array.cpp b/src/streaming/fs.multiscale.array.cpp index 814f0302..539b7d01 100644 --- a/src/streaming/fs.multiscale.array.cpp 
+++ b/src/streaming/fs.multiscale.array.cpp @@ -23,9 +23,18 @@ zarr::FSMultiscaleArray::write_metadata_() LOG_ERROR("Failed to make metadata."); return false; } + + if (last_written_metadata_ == metadata) { + return true; // no changes + } const std::string path = node_path_() + "/zarr.json"; - return write_string_(path, metadata, 0); + bool success; + if ((success = write_string(path, metadata, 0))) { + last_written_metadata_ = metadata; + } + + return success; } bool diff --git a/src/streaming/fs.storage.cpp b/src/streaming/fs.storage.cpp index 556962b9..c3d749b9 100644 --- a/src/streaming/fs.storage.cpp +++ b/src/streaming/fs.storage.cpp @@ -18,12 +18,12 @@ zarr::FSStorage::FSStorage(std::shared_ptr file_handle_pool) } bool -zarr::FSStorage::write_binary_(const std::string& path, +zarr::FSStorage::write_binary(const std::string& path, const std::vector& data, size_t offset) const { void* flags = make_flags(); - auto handle = file_handle_pool_->get_handle(path, flags); + const auto handle = file_handle_pool_->get_handle(path, flags); destroy_flags(flags); if (handle == nullptr) { @@ -40,12 +40,12 @@ zarr::FSStorage::write_binary_(const std::string& path, } bool -zarr::FSStorage::write_string_(const std::string& path, +zarr::FSStorage::write_string(const std::string& path, const std::string& data, size_t offset) const { void* flags = make_flags(); - auto handle = file_handle_pool_->get_handle(path, flags); + const auto handle = file_handle_pool_->get_handle(path, flags); destroy_flags(flags); if (handle == nullptr) { diff --git a/src/streaming/fs.storage.hh b/src/streaming/fs.storage.hh index 74c29de6..1168694b 100644 --- a/src/streaming/fs.storage.hh +++ b/src/streaming/fs.storage.hh @@ -12,9 +12,6 @@ class FSStorage explicit FSStorage(std::shared_ptr file_handle_pool); virtual ~FSStorage() = default; - protected: - std::shared_ptr file_handle_pool_; - /** * @brief Write binary data to a path at the given offset. * @param path The path to write to. 
@@ -22,9 +19,9 @@ class FSStorage * @param offset The offset to write at. * @return True if the write was successful, false otherwise. */ - [[nodiscard]] bool write_binary_(const std::string& path, - const std::vector& data, - size_t offset) const; + [[nodiscard]] bool write_binary(const std::string& path, + const std::vector& data, + size_t offset) const; /** * @brief Write a string to a path at the given offset. @@ -33,8 +30,11 @@ class FSStorage * @param offset The offset to write at. * @return True if the write was successful, false otherwise. */ - [[nodiscard]] bool write_string_(const std::string& path, - const std::string& data, - size_t offset) const; + [[nodiscard]] bool write_string(const std::string& path, + const std::string& data, + size_t offset) const; + + protected: + std::shared_ptr file_handle_pool_; }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index b33e74de..72eb7ed0 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -22,9 +22,17 @@ zarr::S3Array::write_metadata_() LOG_ERROR("Failed to make metadata."); return false; } - const std::string path = node_path_() + "/zarr.json"; - return write_string_(path, metadata, 0); + if (last_written_metadata_ == metadata) { + return true; // no changes + } + const std::string key = node_path_() + "/zarr.json"; + + bool success; + if ((success = write_string(key, metadata, 0) && finalize_object(key))) { + last_written_metadata_ = metadata; + } + return success; } bool @@ -60,12 +68,12 @@ zarr::S3Array::flush_data_() try { // consolidate chunks in shard - const auto shard_data = consolidate_chunks_(shard_idx); - if (!write_binary_(data_path, shard_data, *file_offset)) { + if (const auto shard_data = consolidate_chunks_(shard_idx); + !write_binary(data_path, shard_data, *file_offset)) { err = "Failed to write shard at path " + data_path; success = false; } else { - *file_offset = shard_data.size(); + *file_offset 
+= shard_data.size(); } } catch (const std::exception& exc) { err = "Failed to flush data: " + std::string(exc.what()); @@ -122,7 +130,7 @@ zarr::S3Array::flush_tables_() std::string data_path = data_paths_[shard_idx]; - if (!write_binary_(data_path, table, *file_offset)) { + if (!write_binary(data_path, table, *file_offset)) { LOG_ERROR("Failed to write table and checksum to shard ", shard_idx, " at path ", @@ -147,7 +155,7 @@ void zarr::S3Array::close_io_streams_() { for (const auto& key : data_paths_) { - s3_objects_.erase(key); + EXPECT(finalize_object(key), "Failed to finalize S3 object at ", key); } data_paths_.clear(); diff --git a/src/streaming/s3.multiscale.array.cpp b/src/streaming/s3.multiscale.array.cpp index 5b862011..d561bdc6 100644 --- a/src/streaming/s3.multiscale.array.cpp +++ b/src/streaming/s3.multiscale.array.cpp @@ -23,9 +23,17 @@ zarr::S3MultiscaleArray::write_metadata_() LOG_ERROR("Failed to make metadata."); return false; } - const std::string path = node_path_() + "/zarr.json"; - return write_string_(path, metadata, 0); + if (last_written_metadata_ == metadata) { + return true; // no changes + } + const std::string key = node_path_() + "/zarr.json"; + + bool success; + if ((success = write_string(key, metadata, 0) && finalize_object(key))) { + last_written_metadata_ = metadata; + } + return success; } bool diff --git a/src/streaming/s3.storage.cpp b/src/streaming/s3.storage.cpp index 57b785d0..1e766f95 100644 --- a/src/streaming/s3.storage.cpp +++ b/src/streaming/s3.storage.cpp @@ -15,7 +15,7 @@ zarr::S3Storage::S3Storage(const std::string& bucket_name, bool zarr::S3Storage::finalize_object(const std::string& path) { - if (auto it = s3_objects_.find(path); it != s3_objects_.end()) { + if (const auto it = s3_objects_.find(path); it != s3_objects_.end()) { if (const auto& s3_object = it->second; s3_object != nullptr) { if (!s3_object->close()) { LOG_ERROR("Failed to finalize S3 object at ", path); @@ -41,7 +41,7 @@ 
zarr::S3Storage::create_s3_object_(const std::string& key) } bool -zarr::S3Storage::write_binary_(const std::string& key, +zarr::S3Storage::write_binary(const std::string& key, const std::vector& data, size_t offset) { @@ -58,7 +58,7 @@ zarr::S3Storage::write_binary_(const std::string& key, } bool -zarr::S3Storage::write_string_(const std::string& key, +zarr::S3Storage::write_string(const std::string& key, const std::string& data, size_t offset) { diff --git a/src/streaming/s3.storage.hh b/src/streaming/s3.storage.hh index 95e785fa..c903102e 100644 --- a/src/streaming/s3.storage.hh +++ b/src/streaming/s3.storage.hh @@ -12,21 +12,6 @@ class S3Storage std::shared_ptr s3_connection_pool); virtual ~S3Storage() = default; - /** - * @brief Finalize the object at the given path. - * @details This will ensure that any buffered data is flushed and the - * object is properly closed. - * @param path The path of the object to finalize. - * @return True if the object was successfully finalized, otherwise false. - */ - [[nodiscard]] bool finalize_object(const std::string& path); - - protected: - const std::string bucket_name_; - std::shared_ptr s3_connection_pool_; - - void create_s3_object_(const std::string& key); - /** * @brief Write binary data to a path at the given offset. * @param key The path to write to. @@ -34,9 +19,9 @@ class S3Storage * @param offset The offset to write at. * @return True if the write was successful, false otherwise. */ - [[nodiscard]] bool write_binary_(const std::string& key, - const std::vector& data, - size_t offset); + [[nodiscard]] bool write_binary(const std::string& key, + const std::vector& data, + size_t offset); /** * @brief Write a string to a path at the given offset. @@ -45,9 +30,24 @@ class S3Storage * @param offset The offset to write at. * @return True if the write was successful, false otherwise. 
*/ - [[nodiscard]] bool write_string_(const std::string& key, - const std::string& data, - size_t offset); + [[nodiscard]] bool write_string(const std::string& key, + const std::string& data, + size_t offset); + + /** + * @brief Finalize the object at the given path. + * @details This will ensure that any buffered data is flushed and the + * object is properly closed. + * @param path The path of the object to finalize. + * @return True if the object was successfully finalized, otherwise false. + */ + [[nodiscard]] bool finalize_object(const std::string& path); + + protected: + const std::string bucket_name_; + std::shared_ptr s3_connection_pool_; + + void create_s3_object_(const std::string& key); std::unordered_map> s3_objects_; }; diff --git a/src/streaming/sink.cpp b/src/streaming/sink.cpp deleted file mode 100644 index 0d947cec..00000000 --- a/src/streaming/sink.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include "sink.hh" -#include "file.sink.hh" -#include "s3.sink.hh" -#include "macros.hh" - -#include -#include -#include -#include -#include - -namespace fs = std::filesystem; - -namespace { -bool -bucket_exists(std::string_view bucket_name, - std::shared_ptr connection_pool) -{ - CHECK(!bucket_name.empty()); - EXPECT(connection_pool, "S3 connection pool not provided."); - - auto conn = connection_pool->get_connection(); - bool bucket_exists = conn->bucket_exists(bucket_name); - - connection_pool->return_connection(std::move(conn)); - - return bucket_exists; -} - -bool -make_s3_sinks(std::string_view bucket_name, - const std::vector& object_keys, - std::shared_ptr connection_pool, - std::vector>& sinks) -{ - if (object_keys.empty()) { - return true; - } - - if (bucket_name.empty()) { - LOG_ERROR("Bucket name not provided."); - return false; - } - if (!connection_pool) { - LOG_ERROR("S3 connection pool not provided."); - return false; - } - - const auto n_objects = object_keys.size(); - sinks.resize(n_objects); - for (auto i = 0; i < n_objects; ++i) { - sinks[i] = 
std::make_unique( - bucket_name, object_keys[i], connection_pool); - } - - return true; -} -} // namespace - -bool -zarr::finalize_sink(std::unique_ptr&& sink) -{ - if (sink == nullptr) { - LOG_INFO("Sink is null. Nothing to finalize."); - return true; - } - - if (!sink->flush_()) { - return false; - } - - sink.reset(); - return true; -} - -std::vector -zarr::construct_data_paths(std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension) -{ - - - return paths_out; -} - -std::unique_ptr -zarr::make_file_sink(std::string_view file_path, - std::shared_ptr file_handle_pool) -{ - if (file_path.starts_with("file://")) { - file_path = file_path.substr(7); - } - - EXPECT(!file_path.empty(), "File path must not be empty."); - - fs::path path(file_path); - EXPECT(!path.empty(), "Invalid file path: ", file_path); - - fs::path parent_path = path.parent_path(); - - if (!fs::is_directory(parent_path)) { - std::error_code ec; - if (!fs::create_directories(parent_path, ec) && - !fs::is_directory(parent_path)) { - LOG_ERROR( - "Failed to create directory '", parent_path, "': ", ec.message()); - return nullptr; - } - } - - return std::make_unique(file_path, file_handle_pool); -} - -std::unique_ptr -zarr::make_s3_sink(std::string_view bucket_name, - std::string_view object_key, - std::shared_ptr connection_pool) -{ - EXPECT(!object_key.empty(), "Object key must not be empty."); - - // bucket name and connection pool are checked in bucket_exists - if (!bucket_exists(bucket_name, connection_pool)) { - LOG_ERROR("Bucket '", bucket_name, "' does not exist."); - return nullptr; - } - - return std::make_unique(bucket_name, object_key, connection_pool); -} - -bool -zarr::make_data_s3_sinks(std::string_view bucket_name, - std::string_view base_path, - const ArrayDimensions& dimensions, - const DimensionPartsFun& parts_along_dimension, - std::shared_ptr connection_pool, - std::vector>& part_sinks) -{ - EXPECT(!base_path.empty(), "Base 
path must not be empty."); - EXPECT(!bucket_name.empty(), "Bucket name must not be empty."); - - const auto paths = - construct_data_paths(base_path, dimensions, parts_along_dimension); - - return make_s3_sinks(bucket_name, paths, connection_pool, part_sinks); -} diff --git a/src/streaming/zarr.common.cpp b/src/streaming/zarr.common.cpp index 52b7bb49..e176db26 100644 --- a/src/streaming/zarr.common.cpp +++ b/src/streaming/zarr.common.cpp @@ -17,8 +17,7 @@ zarr::trim(std::string_view s) // trim left std::string trimmed(s); - trimmed.erase(trimmed.begin(), - std::find_if(trimmed.begin(), trimmed.end(), [](char c) { + trimmed.erase(trimmed.begin(), std::ranges::find_if(trimmed, [](char c) { return !std::isspace(c); })); diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index 885e9b24..a36e9291 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -2,9 +2,11 @@ #include "array.base.hh" #include "fs.array.hh" #include "fs.multiscale.array.hh" +#include "fs.storage.hh" #include "macros.hh" #include "s3.array.hh" -#include +#include "s3.multiscale.array.hh" +#include "s3.storage.hh" #include "zarr.common.hh" #include "zarr.stream.hh" @@ -806,23 +808,6 @@ check_array_structure(std::vector> arrays, return true; } - -std::string -dimension_type_to_string(ZarrDimensionType type) -{ - switch (type) { - case ZarrDimensionType_Time: - return "time"; - case ZarrDimensionType_Channel: - return "channel"; - case ZarrDimensionType_Space: - return "space"; - case ZarrDimensionType_Other: - return "other"; - default: - return "(unknown)"; - } -} } // namespace /* ZarrStream_s implementation */ @@ -941,46 +926,36 @@ ZarrStream_s::write_custom_metadata(std::string_view custom_metadata, return ZarrStatusCode_InvalidArgument; } - // // check if we have already written custom metadata - // if (!custom_metadata_sink_) { - // const std::string metadata_key = "acquire.json"; - // std::string base_path = store_path_; - // if 
(base_path.starts_with("file://")) { - // base_path = base_path.substr(7); - // } - // const auto prefix = base_path.empty() ? "" : base_path + "/"; - // const auto sink_path = prefix + metadata_key; - // - // if (is_s3_acquisition_()) { - // custom_metadata_sink_ = zarr::make_s3_sink( - // s3_settings_->bucket_name, sink_path, s3_connection_pool_); - // } else { - // custom_metadata_sink_ = - // zarr::make_file_sink(sink_path, file_handle_pool_); - // } - // } else if (!overwrite) { // custom metadata already written, don't overwrite - // LOG_ERROR("Custom metadata already written, use overwrite flag"); - // return ZarrStatusCode_WillNotOverwrite; - // } - // - // if (!custom_metadata_sink_) { - // LOG_ERROR("Custom metadata sink not found"); - // return ZarrStatusCode_InternalError; - // } - // - // const auto metadata_json = nlohmann::json::parse(custom_metadata, - // nullptr, // callback - // false, // allow exceptions - // true // ignore comments - // ); - // - // const auto metadata_str = metadata_json.dump(4); - // std::span data{ reinterpret_cast(metadata_str.data()), - // metadata_str.size() }; - // if (!custom_metadata_sink_->write(0, data)) { - // LOG_ERROR("Error writing custom metadata"); - // return ZarrStatusCode_IOError; - // } + const std::string prefix = store_path_.empty() ? 
"" : store_path_ + "/"; + const std::string path = prefix + "acquire.json"; + + // check if we have already written custom metadata + if (custom_metadata_ && + !overwrite) { // custom metadata already written, don't + LOG_ERROR("Custom metadata already written, use overwrite flag"); + return ZarrStatusCode_WillNotOverwrite; + } + + const auto metadata_json = nlohmann::json::parse(custom_metadata, + nullptr, // callback + false, // allow exceptions + true // ignore comments + ); + + custom_metadata_ = metadata_json.dump(4); + + bool success; + if (is_s3_acquisition_()) { + success = write_string_to_s3_( + s3_settings_->bucket_name, path, *custom_metadata_); + } else { + success = write_string_to_file_(path, *custom_metadata_); + } + + if (!success) { + LOG_ERROR("Error writing custom metadata"); + return ZarrStatusCode_IOError; + } return ZarrStatusCode_Success; } @@ -1322,6 +1297,9 @@ bool ZarrStream_s::commit_settings_(const struct ZarrStreamSettings_s* settings) { store_path_ = zarr::trim(settings->store_path); + if (store_path_.starts_with("file://")) { + store_path_ = store_path_.substr(7); + } std::optional bucket_name; s3_settings_ = make_s3_settings(settings->s3_settings); @@ -1432,75 +1410,62 @@ ZarrStream_s::create_store_(bool overwrite) bool ZarrStream_s::write_intermediate_metadata_() { - // std::optional bucket_name; - // if (s3_settings_) { - // bucket_name = s3_settings_->bucket_name; - // } - // - // const nlohmann::json group_metadata = nlohmann::json({ - // { "zarr_format", 3 }, - // { "consolidated_metadata", nullptr }, - // { "node_type", "group" }, - // { "attributes", nlohmann::json::object() }, - // }); - // const std::string metadata_key = "zarr.json"; - // std::string metadata_str; - // - // for (const auto& parent_group_key : intermediate_group_paths_) { - // const std::string relative_path = - // (parent_group_key.empty() ? "" : parent_group_key); - // - // if (auto pit = plates_.find(relative_path); // is it a plate? 
- // pit != plates_.end()) { - // const auto& plate = pit->second; - // nlohmann::json plate_metadata( - // group_metadata); // make a copy to modify - // - // // not supported for Zarr V2 / NGFF 0.4 - // plate_metadata["attributes"]["ome"] = { - // { "version", "0.5" }, - // { "plate", plate.to_json() }, - // }; - // - // metadata_str = plate_metadata.dump(4); - // } else if (auto wit = wells_.find(relative_path); // is it a well? - // wit != wells_.end()) { - // const auto& well = wit->second; - // nlohmann::json well_metadata( - // group_metadata); // make a copy to modify - // - // // not supported for Zarr V2 / NGFF 0.4 - // well_metadata["attributes"]["ome"] = { - // { "version", "0.5" }, - // { "well", well.to_json() }, - // }; - // - // metadata_str = well_metadata.dump(4); - // } else { // generic group - // metadata_str = group_metadata.dump(4); - // } - // - // ConstByteSpan metadata_span( - // reinterpret_cast(metadata_str.data()), - // metadata_str.size()); - // - // const std::string sink_path = - // store_path_ + "/" + relative_path + "/" + metadata_key; - // std::unique_ptr metadata_sink; - // if (is_s3_acquisition_()) { - // metadata_sink = zarr::make_s3_sink( - // bucket_name.value(), sink_path, s3_connection_pool_); - // } else { - // metadata_sink = zarr::make_file_sink(sink_path, file_handle_pool_); - // } - // - // if (!metadata_sink->write(0, metadata_span) || - // !zarr::finalize_sink(std::move(metadata_sink))) { - // set_error_("Failed to write intermediate metadata for group '" + - // parent_group_key + "'"); - // return false; - // } - // } + const nlohmann::json group_metadata = nlohmann::json({ + { "zarr_format", 3 }, + { "consolidated_metadata", nullptr }, + { "node_type", "group" }, + { "attributes", nlohmann::json::object() }, + }); + const std::string metadata_key = "zarr.json"; + std::string metadata_str; + + for (const auto& parent_group_key : intermediate_group_paths_) { + const std::string relative_path = + 
(parent_group_key.empty() ? "" : parent_group_key); + + if (auto pit = plates_.find(relative_path); // is it a plate? + pit != plates_.end()) { + const auto& plate = pit->second; + nlohmann::json plate_metadata( + group_metadata); // make a copy to modify + + // not supported for Zarr V2 / NGFF 0.4 + plate_metadata["attributes"]["ome"] = { + { "version", "0.5" }, + { "plate", plate.to_json() }, + }; + + metadata_str = plate_metadata.dump(4); + } else if (auto wit = wells_.find(relative_path); // is it a well? + wit != wells_.end()) { + const auto& well = wit->second; + nlohmann::json well_metadata( + group_metadata); // make a copy to modify + + // not supported for Zarr V2 / NGFF 0.4 + well_metadata["attributes"]["ome"] = { + { "version", "0.5" }, + { "well", well.to_json() }, + }; + + metadata_str = well_metadata.dump(4); + } else { // generic group + metadata_str = group_metadata.dump(4); + } + + const std::string path = + store_path_ + "/" + relative_path + "/" + metadata_key; + if (is_s3_acquisition_()) { + if (!write_string_to_s3_( + s3_settings_->bucket_name, path, metadata_str)) { + return false; + } + } else { + if (!write_string_to_file_(path, metadata_str)) { + return false; + } + } + } return true; } @@ -1524,7 +1489,7 @@ ZarrStream_s::init_frame_queue_() } // cap the frame buffer at 1 GiB, or 10 frames, whichever is larger - const auto buffer_size_bytes = 1ULL << 30; + constexpr auto buffer_size_bytes = 1ULL << 30; const auto frame_count = std::max(10ULL, buffer_size_bytes / frame_size_bytes); @@ -1666,12 +1631,6 @@ finalize_stream(struct ZarrStream_s* stream) // thread stream->thread_pool_->await_stop(); - // if (stream->custom_metadata_sink_ && - // !zarr::finalize_sink(std::move(stream->custom_metadata_sink_))) { - // LOG_ERROR( - // "Error finalizing Zarr stream. 
Failed to write custom metadata"); - // } - for (auto& [key, output] : stream->output_arrays_) { if (!zarr::finalize_array(std::move(output.array))) { LOG_ERROR( @@ -1689,3 +1648,25 @@ finalize_stream(struct ZarrStream_s* stream) return true; } + +bool +ZarrStream_s::write_string_to_file_(const std::string& path, + const std::string& data) const +{ + EXPECT(file_handle_pool_ != nullptr, "File handle pool is not initialized"); + + zarr::FSStorage storage(file_handle_pool_); + return storage.write_string(path, data, 0); +} + +bool +ZarrStream_s::write_string_to_s3_(const std::string& bucket_name, + const std::string& key, + const std::string& data) const +{ + EXPECT(s3_connection_pool_ != nullptr, + "S3 connection pool is not initialized"); + + zarr::S3Storage storage(bucket_name, s3_connection_pool_); + return storage.write_string(key, data, 0) && storage.finalize_object(key); +} diff --git a/src/streaming/zarr.stream.hh b/src/streaming/zarr.stream.hh index 86e6365d..59fd0186 100644 --- a/src/streaming/zarr.stream.hh +++ b/src/streaming/zarr.stream.hh @@ -10,7 +10,7 @@ #include "thread.pool.hh" #include -#include // unique_ptr +#include // unique_ptr #include #include #include @@ -19,7 +19,7 @@ struct ZarrStream_s { public: - ZarrStream_s(struct ZarrStreamSettings_s* settings); + explicit ZarrStream_s(struct ZarrStreamSettings_s* settings); /** * @brief Append data to the stream with a specific key. @@ -79,7 +79,7 @@ struct ZarrStream_s std::shared_ptr s3_connection_pool_; std::shared_ptr file_handle_pool_; - // std::unique_ptr custom_metadata_sink_; + std::optional custom_metadata_; bool is_s3_acquisition_() const; @@ -154,6 +154,15 @@ struct ZarrStream_s /** @brief Wait for the frame queue to finish processing. */ void finalize_frame_queue_(); + /** @brief Write a string @p data to a file @p path. */ + bool write_string_to_file_(const std::string& path, + const std::string& data) const; + + /** @brief Write a string @p data to an S3 object @p key on @p bucket. 
*/ + bool write_string_to_s3_(const std::string& bucket_name, + const std::string& key, + const std::string& data) const; + friend bool finalize_stream(struct ZarrStream_s* stream); }; diff --git a/tests/unit-tests/array-dimensions-chunk-internal-offset.cpp b/tests/unit-tests/array-dimensions-chunk-internal-offset.cpp index 531389db..0ce05270 100644 --- a/tests/unit-tests/array-dimensions-chunk-internal-offset.cpp +++ b/tests/unit-tests/array-dimensions-chunk-internal-offset.cpp @@ -10,11 +10,11 @@ main() std::vector dims; dims.emplace_back( - "t", ZarrDimensionType_Time, 0, 5, 0); // 5 timepoints / chunk - dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 0); // 2 chunks - dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 0); // 3 chunks - dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 0); // 3 chunks - dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 0); // 4 chunks + "t", ZarrDimensionType_Time, 0, 5, 1); // 5 timepoints / chunk + dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 1); // 2 chunks + dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 1); // 3 chunks + dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 1); // 3 chunks + dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 1); // 4 chunks ArrayDimensions dimensions(std::move(dims), ZarrDataType_uint16); try { diff --git a/tests/unit-tests/array-dimensions-chunk-lattice-index.cpp b/tests/unit-tests/array-dimensions-chunk-lattice-index.cpp index 53bbcbb0..d8de6859 100644 --- a/tests/unit-tests/array-dimensions-chunk-lattice-index.cpp +++ b/tests/unit-tests/array-dimensions-chunk-lattice-index.cpp @@ -11,11 +11,11 @@ main() try { std::vector dims; dims.emplace_back( - "t", ZarrDimensionType_Time, 0, 5, 0); // 5 timepoints / chunk - dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 0); // 2 chunks - dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 0); // 3 chunks - dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 0); // 3 chunks - 
dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 0); // 4 chunks + "t", ZarrDimensionType_Time, 0, 5, 1); // 5 timepoints / chunk + dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 1); // 2 chunks + dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 1); // 3 chunks + dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 1); // 3 chunks + dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 1); // 4 chunks ArrayDimensions dimensions(std::move(dims), ZarrDataType_uint8); EXPECT_EQ(int, dimensions.chunk_lattice_index(0, 2), 0); diff --git a/tests/unit-tests/array-dimensions-tile-group-offset.cpp b/tests/unit-tests/array-dimensions-tile-group-offset.cpp index c62368db..ca0dd2f2 100644 --- a/tests/unit-tests/array-dimensions-tile-group-offset.cpp +++ b/tests/unit-tests/array-dimensions-tile-group-offset.cpp @@ -10,11 +10,11 @@ main() std::vector dims; dims.emplace_back( - "t", ZarrDimensionType_Time, 0, 5, 0); // 5 timepoints / chunk - dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 0); // 2 chunks - dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 0); // 3 chunks - dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 0); // 3 chunks - dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 0); // 4 chunks + "t", ZarrDimensionType_Time, 0, 5, 1); // 5 timepoints / chunk + dims.emplace_back("c", ZarrDimensionType_Channel, 3, 2, 1); // 2 chunks + dims.emplace_back("z", ZarrDimensionType_Space, 5, 2, 1); // 3 chunks + dims.emplace_back("y", ZarrDimensionType_Space, 48, 16, 1); // 3 chunks + dims.emplace_back("x", ZarrDimensionType_Space, 64, 16, 1); // 4 chunks ArrayDimensions dimensions(std::move(dims), ZarrDataType_float32); try { diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index 2ee0137e..fd138d98 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -12,41 +12,42 @@ namespace fs = std::filesystem; namespace { const fs::path base_dir = 
fs::temp_directory_path() / TEST; -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; -const unsigned int n_frames = array_planes * array_channels * array_timepoints; +constexpr unsigned int array_width = 64, array_height = 48, array_planes = 6, + array_channels = 8, array_timepoints = 10; +constexpr unsigned int n_frames = + array_planes * array_channels * array_timepoints; -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; +constexpr unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, + chunk_channels = 4, chunk_timepoints = 5; -const unsigned int shard_width = 2, shard_height = 1, shard_planes = 1, - shard_channels = 2, shard_timepoints = 2; -const unsigned int chunks_per_shard = +constexpr unsigned int shard_width = 2, shard_height = 1, shard_planes = 1, + shard_channels = 2, shard_timepoints = 2; +constexpr unsigned int chunks_per_shard = shard_width * shard_height * shard_planes * shard_channels * shard_timepoints; -const unsigned int chunks_in_x = +constexpr unsigned int chunks_in_x = (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = +constexpr unsigned int chunks_in_y = (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = +constexpr unsigned int chunks_in_z = (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = +constexpr unsigned int chunks_in_c = (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = +constexpr unsigned int chunks_in_t = (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; -const unsigned int shards_in_x = +constexpr unsigned int shards_in_x = (chunks_in_x + shard_width - 1) / shard_width; // 2 shards -const unsigned int shards_in_y = +constexpr unsigned int shards_in_y = (chunks_in_y + shard_height - 1) / 
shard_height; // 3 shards -const unsigned int shards_in_z = +constexpr unsigned int shards_in_z = (chunks_in_z + shard_planes - 1) / shard_planes; // 3 shards -const unsigned int shards_in_c = +constexpr unsigned int shards_in_c = (chunks_in_c + shard_channels - 1) / shard_channels; // 1 shard -const unsigned int shards_in_t = +constexpr unsigned int shards_in_t = (chunks_in_t + shard_timepoints - 1) / shard_timepoints; // 1 shard -const int level_of_detail = 3; +constexpr int level_of_detail = 3; } // namespace void @@ -105,8 +106,7 @@ main() // try { auto thread_pool = std::make_shared( - std::thread::hardware_concurrency(), - [](const std::string& err) { LOG_ERROR("Error: ", err); }); + 0, [](const std::string& err) { LOG_ERROR("Error: ", err); }); std::vector dims; dims.emplace_back("t", @@ -132,7 +132,7 @@ main() dims.emplace_back( "x", ZarrDimensionType_Space, array_width, chunk_width, shard_width); - auto config = std::make_shared( + const auto config = std::make_shared( base_dir.string(), "", std::nullopt, @@ -142,11 +142,10 @@ main() std::nullopt, level_of_detail); + // write the data { auto writer = std::make_unique( - config, - thread_pool, - std::make_shared()); + config, thread_pool, std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); From 5531bd2d840393fc5cee4910c21783234d7de6c0 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 13:51:14 +0200 Subject: [PATCH 17/38] #include where appropriate --- src/streaming/acquire.zarr.cpp | 1 + src/streaming/fs.array.cpp | 1 + src/streaming/s3.array.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/src/streaming/acquire.zarr.cpp b/src/streaming/acquire.zarr.cpp index dac07584..460c36ee 100644 --- a/src/streaming/acquire.zarr.cpp +++ b/src/streaming/acquire.zarr.cpp @@ -4,6 +4,7 @@ #include "zarr.stream.hh" #include // bit_ceil +#include // memcpy #include // uint32_t #include #include diff --git 
a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index d49147de..df68e7ec 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -3,6 +3,7 @@ #include +#include // memcp #include #include #include diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index 72eb7ed0..d25e1b89 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -3,6 +3,7 @@ #include +#include // memcpy #include zarr::S3Array::S3Array(std::shared_ptr config, From 2f92a0c2b728f9fd275b32dfdd64e65dc3945c81 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 17:33:32 +0200 Subject: [PATCH 18/38] kill -9 Sink --- benchmarks/benchmark.py | 12 ++- python/acquire-zarr-py.cpp | 2 +- setup.py | 2 +- src/streaming/array.cpp | 162 ++++++++++++++++------------------ src/streaming/array.hh | 14 +-- src/streaming/fs.array.cpp | 138 +++++++++++++++++++++-------- src/streaming/fs.array.hh | 5 ++ src/streaming/fs.storage.cpp | 8 +- src/streaming/s3.array.cpp | 35 ++++++-- src/streaming/s3.storage.cpp | 8 +- src/streaming/zarr.stream.cpp | 40 +++++++-- 11 files changed, 270 insertions(+), 156 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index b6c8adb4..d72be017 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -150,15 +150,23 @@ def run_acquire_zarr_test( elapsed_times = [] total_start = time.perf_counter_ns() + chunk = np.empty((tchunk_size, 2048, 2048), dtype=np.uint16) for i in range(data.shape[0]): start_plane = time.perf_counter_ns() - stream.append(data[i]) + chunk_idx = i % tchunk_size + chunk[chunk_idx] = data[i] + if chunk_idx == tchunk_size - 1: + stream.append(chunk) elapsed = time.perf_counter_ns() - start_plane elapsed_times.append(elapsed) print(f"Acquire-zarr: Plane {i} written in {elapsed / 1e6:.3f} ms") # Close (or flush) the stream to finalize writes. 
- del stream + start_close = time.perf_counter_ns() + stream.close() + elapsed = time.perf_counter_ns() - start_close + elapsed_times.append(elapsed) + print(f"Acquire-zarr: Final close took {elapsed / 1e6:.3f} ms") total_elapsed = time.perf_counter_ns() - total_start tot_ms = total_elapsed / 1e6 print(f"Acquire-zarr: Total write time: {tot_ms:.3f} ms") diff --git a/python/acquire-zarr-py.cpp b/python/acquire-zarr-py.cpp index 73ea1d5b..220781f5 100644 --- a/python/acquire-zarr-py.cpp +++ b/python/acquire-zarr-py.cpp @@ -1087,7 +1087,7 @@ class PyZarrStream } auto buf = contiguous_data.request(); - auto* ptr = (uint8_t*)buf.ptr; + auto* ptr = static_cast(buf.ptr); py::gil_scoped_release release; diff --git a/setup.py b/setup.py index 17c062b5..c9599a22 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def build_extension(self, ext): build_dir = os.path.abspath(os.path.join(ext.sourcedir, "build")) - cfg = "Debug" # if self.debug else "Release" + cfg = "Debug" if self.debug else "Release" cmake_args = [ "--preset=default", diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 96d987a2..bee556ac 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -2,6 +2,7 @@ #include "macros.hh" #include "zarr.common.hh" +#include #include #include // std::fill @@ -71,7 +72,7 @@ zarr::Array::Array(std::shared_ptr config, { const size_t n_chunks = config_->dimensions->number_of_chunks_in_memory(); EXPECT(n_chunks > 0, "Array has zero chunks in memory"); - chunk_buffers_ = std::vector(n_chunks); + chunk_buffers_ = std::vector>(n_chunks); const auto& dims = config_->dimensions; const auto number_of_shards = dims->number_of_shards(); @@ -334,7 +335,8 @@ zarr::Array::fill_buffers_() const auto n_bytes = config_->dimensions->bytes_per_chunk(); for (auto& buf : chunk_buffers_) { - buf.resize_and_fill(n_bytes, 0); + buf.resize(n_bytes); // no-op if already that size + std::ranges::fill(buf, 0); } } @@ -382,64 +384,49 @@ 
zarr::Array::write_frame_to_chunks_(LockedBuffer& data) #pragma omp parallel for reduction(+ : bytes_written) for (auto tile = 0; tile < n_tiles; ++tile) { auto& chunk_buffer = chunk_buffers_[tile + group_offset]; - bytes_written += chunk_buffer.with_lock([chunk_offset, - frame_rows, - frame_cols, - tile_rows, - tile_cols, - tile, - n_tiles_x, - bytes_per_px, - bytes_per_row, - bytes_per_chunk, - &frame](auto& chunk_data) { - const auto* data_ptr = frame.data(); - const auto data_size = frame.size(); - - const auto chunk_start = chunk_data.data(); - - const auto tile_idx_y = tile / n_tiles_x; - const auto tile_idx_x = tile % n_tiles_x; - - auto chunk_pos = chunk_offset; - size_t bytes_written = 0; - - for (auto k = 0; k < tile_rows; ++k) { - const auto frame_row = tile_idx_y * tile_rows + k; - if (frame_row < frame_rows) { - const auto frame_col = tile_idx_x * tile_cols; - - const auto region_width = - std::min(frame_col + tile_cols, frame_cols) - frame_col; - - const auto region_start = - bytes_per_px * (frame_row * frame_cols + frame_col); - const auto nbytes = region_width * bytes_per_px; - - // copy region - EXPECT(region_start + nbytes <= data_size, - "Buffer overflow in framme. Region start: ", - region_start, - " nbytes: ", - nbytes, - " data size: ", - data_size); - EXPECT(chunk_pos + nbytes <= bytes_per_chunk, - "Buffer overflow in chunk. 
Chunk pos: ", - chunk_pos, - " nbytes: ", - nbytes, - " bytes per chunk: ", - bytes_per_chunk); - memcpy( - chunk_start + chunk_pos, data_ptr + region_start, nbytes); - bytes_written += nbytes; - } - chunk_pos += bytes_per_row; + const auto* data_ptr = frame.data(); + const auto data_size = frame.size(); + + const auto chunk_start = chunk_buffer.data(); + + const auto tile_idx_y = tile / n_tiles_x; + const auto tile_idx_x = tile % n_tiles_x; + + auto chunk_pos = chunk_offset; + + for (auto k = 0; k < tile_rows; ++k) { + const auto frame_row = tile_idx_y * tile_rows + k; + if (frame_row < frame_rows) { + const auto frame_col = tile_idx_x * tile_cols; + + const auto region_width = + std::min(frame_col + tile_cols, frame_cols) - frame_col; + + const auto region_start = + bytes_per_px * (frame_row * frame_cols + frame_col); + const auto nbytes = region_width * bytes_per_px; + + // copy region + EXPECT(region_start + nbytes <= data_size, + "Buffer overflow in framme. Region start: ", + region_start, + " nbytes: ", + nbytes, + " data size: ", + data_size); + EXPECT(chunk_pos + nbytes <= bytes_per_chunk, + "Buffer overflow in chunk. 
Chunk pos: ", + chunk_pos, + " nbytes: ", + nbytes, + " bytes per chunk: ", + bytes_per_chunk); + memcpy( + chunk_start + chunk_pos, data_ptr + region_start, nbytes); + bytes_written += nbytes; } - - return bytes_written; - }); + chunk_pos += bytes_per_row; + } } data.assign(std::move(frame)); @@ -447,8 +434,8 @@ zarr::Array::write_frame_to_chunks_(LockedBuffer& data) return bytes_written; } -ByteVector -zarr::Array::consolidate_chunks_(uint32_t shard_index) +zarr::Array::ShardLayer +zarr::Array::collect_chunks_(uint32_t shard_index) { const auto& dims = config_->dimensions; CHECK(shard_index < dims->number_of_shards()); @@ -467,7 +454,6 @@ zarr::Array::consolidate_chunks_(uint32_t shard_index) uint64_t last_chunk_offset = shard_table[2 * layer_offset]; uint64_t last_chunk_size = shard_table[2 * layer_offset + 1]; - size_t shard_size = last_chunk_size; for (auto i = 1; i < chunks_per_layer; ++i) { const auto offset_idx = 2 * (layer_offset + i); @@ -479,30 +465,19 @@ zarr::Array::consolidate_chunks_(uint32_t shard_index) shard_table[offset_idx] = last_chunk_offset + last_chunk_size; last_chunk_offset = shard_table[offset_idx]; last_chunk_size = shard_table[size_idx]; - shard_size += last_chunk_size; } - std::vector shard_layer(shard_size); - const auto chunk_indices_this_layer = dims->chunk_indices_for_shard_layer(shard_index, current_layer_); - size_t offset = 0; - for (const auto& idx : chunk_indices_this_layer) { - // this clears the chunk data out of the LockedBuffer - const auto chunk = chunk_buffers_[idx - chunk_offset].take(); - std::copy(chunk.begin(), chunk.end(), shard_layer.begin() + offset); + ShardLayer layer{ file_offset, {} }; + layer.chunks.reserve(chunk_indices_this_layer.size()); - offset += chunk.size(); + for (const auto& idx : chunk_indices_this_layer) { + layer.chunks.emplace_back(chunk_buffers_[idx - chunk_offset]); } - EXPECT(offset == shard_size, - "Consolidated shard size does not match expected: ", - offset, - " != ", - shard_size); - - 
return std::move(shard_layer); + return std::move(layer); } bool @@ -595,16 +570,35 @@ zarr::Array::compress_chunks_() bool success = false; try { - if (!chunk_buffer.compress(params, bytes_per_px)) { - err = "Failed to compress chunk " + + std::vector compressed_data(chunk_buffer.size() + + BLOSC_MAX_OVERHEAD); + const auto n_bytes_compressed = + blosc_compress_ctx(params.clevel, + params.shuffle, + bytes_per_px, + chunk_buffer.size(), + chunk_buffer.data(), + compressed_data.data(), + compressed_data.size(), + params.codec_id.c_str(), + 0, + 1); + + if (n_bytes_compressed <= 0) { + err = "blosc_compress_ctx failed with code " + + std::to_string(n_bytes_compressed) + " for chunk " + std::to_string(chunk_idx) + " (internal index " + std::to_string(internal_idx) + " of shard " + std::to_string(shard_idx) + ")"; + success = false; + } else { + compressed_data.resize(n_bytes_compressed); + chunk_buffer.swap(compressed_data); + + // update shard table with size + shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); + success = true; } - - // update shard table with size - shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); - success = true; } catch (const std::exception& exc) { err = exc.what(); } diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 40c1852b..6fa8d0b5 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -19,27 +19,27 @@ class Array : public ArrayBase [[nodiscard]] size_t write_frame(LockedBuffer&) override; protected: - /// Buffering - std::vector chunk_buffers_; + struct ShardLayer + { + size_t offset; // offset in bytes from start of shard + std::vector> chunks; + }; - /// Filesystem + std::vector> chunk_buffers_; std::vector data_paths_; - /// Bookkeeping uint64_t bytes_to_flush_; uint32_t frames_written_; uint32_t append_chunk_index_; std::string data_root_; bool is_closing_; - /// Sharding uint32_t current_layer_; std::vector shard_file_offsets_; std::vector> shard_tables_; bool 
make_metadata_(std::string& metadata) override; [[nodiscard]] bool close_() override; - [[nodiscard]] bool close_impl_(); void make_data_paths_(); void fill_buffers_(); @@ -49,7 +49,7 @@ class Array : public ArrayBase size_t write_frame_to_chunks_(LockedBuffer& data); - [[nodiscard]] ByteVector consolidate_chunks_(uint32_t shard_index); + [[nodiscard]] ShardLayer collect_chunks_(uint32_t shard_index); [[nodiscard]] bool compress_and_flush_data_(); [[nodiscard]] bool compress_chunks_(); void update_table_entries_(); diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index df68e7ec..f343059e 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -8,6 +8,15 @@ #include #include +void* +make_flags(); + +void +destroy_flags(void* flags); + +bool +seek_and_write(void* handle, size_t offset, ConstByteSpan data); + namespace fs = std::filesystem; namespace { @@ -139,50 +148,76 @@ zarr::FSArray::flush_data_() const std::string data_path = data_paths_[shard_idx]; auto* file_offset = shard_file_offsets_.data() + shard_idx; - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); + const auto shard_data = collect_chunks_(shard_idx); + if (shard_data.chunks.empty()) { + LOG_ERROR("Failed to collect chunks for shard ", shard_idx); + return false; + } + if (shard_data.offset != *file_offset) { + LOG_ERROR("Inconsistent file offset for shard ", + shard_idx, + ": expected ", + *file_offset, + ", got ", + shard_data.offset); + return false; + } - auto job = - [shard_idx, data_path, file_offset, promise, &all_successful, this]( - std::string& err) { - bool success = true; - - try { - // consolidate chunks in shard - if (const auto shard_data = consolidate_chunks_(shard_idx); - !write_binary(data_path, shard_data, *file_offset)) { - err = "Failed to write shard at path " + data_path; - success = false; - } else { - *file_offset += shard_data.size(); - } - } catch (const std::exception& exc) { - err = "Failed to flush 
data: " + std::string(exc.what()); - success = false; - } - - all_successful.fetch_and(success); - promise->set_value(); - - return success; - }; - - // one thread is reserved for processing the frame queue and runs the - // entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); + size_t layer_offset = shard_data.offset; + + for (auto& chunk : shard_data.chunks) { + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + const auto handle = get_handle_(data_path); + if (handle == nullptr) { + LOG_ERROR("Failed to get file handle for ", data_path); + return false; + } + + const auto chunk_size = chunk.size(); // we move it below + auto job = [data_path, + handle, + layer_offset, + chunk = std::move(chunk), + promise](std::string& err) { + bool success; + try { + success = seek_and_write(handle.get(), layer_offset, chunk); + } catch (const std::exception& exc) { + err = "Failed to write chunk at offset " + + std::to_string(layer_offset) + " to path " + + data_path + ": " + exc.what(); + success = false; + } + + promise->set_value(); + return success; + }; + + // one thread is reserved for processing the frame queue and runs + // the entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || + !thread_pool_->push_job(job)) { + std::string err; + if (!job(err)) { + LOG_ERROR(err); + } } + + layer_offset += chunk_size; } - } - // wait for all threads to finish - for (auto& future : futures) { - future.wait(); + *file_offset = layer_offset; } - return static_cast(all_successful); + // wait for all threads to finish + // for (auto& future : futures) { + // future.wait(); + // } + // + // return static_cast(all_successful); + return true; } bool @@ -211,13 +246,22 @@ zarr::FSArray::flush_tables_() std::string data_path = data_paths_[shard_idx]; - if (!write_binary(data_path, table, *file_offset)) { + const auto handle = 
get_handle_(data_path); + if (handle == nullptr) { + LOG_ERROR("Failed to get file handle for ", data_path); + return false; + } + + if (!seek_and_write(handle.get(), *file_offset, table)) { LOG_ERROR("Failed to write table and checksum to shard ", shard_idx, " at path ", data_path); return false; } + *file_offset += table.size(); + + handles_.erase(data_path); // close the handle } // don't reset state if we're closing @@ -240,3 +284,19 @@ zarr::FSArray::close_io_streams_() data_paths_.clear(); } + +std::shared_ptr +zarr::FSArray::get_handle_(const std::string& path) +{ + std::unique_lock lock(mutex_); + if (handles_.contains(path)) { + return handles_[path]; + } + + void* flags = make_flags(); + const auto handle = file_handle_pool_->get_handle(path, flags); + destroy_flags(flags); + + handles_.emplace(path, handle); + return handle; +} diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 5b289d10..2322eba7 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -14,10 +14,15 @@ class FSArray final std::shared_ptr file_handle_pool); protected: + std::mutex mutex_; + std::unordered_map> handles_; + bool write_metadata_() override; bool flush_data_() override; bool flush_tables_() override; void close_io_streams_() override; + + std::shared_ptr get_handle_(const std::string& path); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/fs.storage.cpp b/src/streaming/fs.storage.cpp index c3d749b9..3e58245a 100644 --- a/src/streaming/fs.storage.cpp +++ b/src/streaming/fs.storage.cpp @@ -19,8 +19,8 @@ zarr::FSStorage::FSStorage(std::shared_ptr file_handle_pool) bool zarr::FSStorage::write_binary(const std::string& path, - const std::vector& data, - size_t offset) const + const std::vector& data, + size_t offset) const { void* flags = make_flags(); const auto handle = file_handle_pool_->get_handle(path, flags); @@ -41,8 +41,8 @@ zarr::FSStorage::write_binary(const std::string& path, bool 
zarr::FSStorage::write_string(const std::string& path, - const std::string& data, - size_t offset) const + const std::string& data, + size_t offset) const { void* flags = make_flags(); const auto handle = file_handle_pool_->get_handle(path, flags); diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index d25e1b89..a13b10d7 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -68,14 +68,35 @@ zarr::S3Array::flush_data_() bool success = true; try { - // consolidate chunks in shard - if (const auto shard_data = consolidate_chunks_(shard_idx); - !write_binary(data_path, shard_data, *file_offset)) { - err = "Failed to write shard at path " + data_path; - success = false; - } else { - *file_offset += shard_data.size(); + const auto shard_data = collect_chunks_(shard_idx); + if (shard_data.chunks.empty()) { + LOG_ERROR("Failed to collect chunks for shard ", + shard_idx); + return false; } + if (shard_data.offset != *file_offset) { + LOG_ERROR("Inconsistent file offset for shard ", + shard_idx, + ": expected ", + *file_offset, + ", got ", + shard_data.offset); + return false; + } + + size_t layer_offset = shard_data.offset; + for (auto& chunk : shard_data.chunks) { + if (!write_binary(data_path, chunk, layer_offset)) { + err = "Failed to write chunk " + + std::to_string(shard_idx) + " at offset " + + std::to_string(layer_offset) + " to path " + + data_path; + success = false; + break; + } + layer_offset += chunk.size(); + } + *file_offset = layer_offset; } catch (const std::exception& exc) { err = "Failed to flush data: " + std::string(exc.what()); success = false; diff --git a/src/streaming/s3.storage.cpp b/src/streaming/s3.storage.cpp index 1e766f95..3455cac8 100644 --- a/src/streaming/s3.storage.cpp +++ b/src/streaming/s3.storage.cpp @@ -42,8 +42,8 @@ zarr::S3Storage::create_s3_object_(const std::string& key) bool zarr::S3Storage::write_binary(const std::string& key, - const std::vector& data, - size_t offset) + const 
std::vector& data, + size_t offset) { create_s3_object_(key); @@ -59,8 +59,8 @@ zarr::S3Storage::write_binary(const std::string& key, bool zarr::S3Storage::write_string(const std::string& key, - const std::string& data, - size_t offset) + const std::string& data, + size_t offset) { create_s3_object_(key); diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index a36e9291..9a7c7216 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -1531,7 +1531,7 @@ ZarrStream_s::process_frame_queue_() std::string output_key; zarr::LockedBuffer frame; - while (process_frames_ || !frame_queue_->empty()) { + while (process_frames_) { { std::unique_lock lock(frame_queue_mutex_); while (frame_queue_->empty() && process_frames_) { @@ -1546,9 +1546,10 @@ ZarrStream_s::process_frame_queue_() // done if (!process_frames_) { break; - } else { - continue; } + + // spurious wakeup, go back to waiting + continue; } } @@ -1587,11 +1588,36 @@ ZarrStream_s::process_frame_queue_() } } + // finish frame queue if (!frame_queue_->empty()) { - LOG_WARNING("Reached end of frame queue processing with ", - frame_queue_->size(), - " frames remaining on queue"); - frame_queue_->clear(); + const size_t frames_remaining = frame_queue_->size(); + for (size_t i = 0; i < frames_remaining; ++i) { + if (!frame_queue_->pop(frame, output_key)) { + continue; + } + + if (auto it = output_arrays_.find(output_key); + it == output_arrays_.end()) { + // If we have gotten here, something has gone seriously wrong + set_error_("Output node not found for key: '" + output_key + + "'"); + std::unique_lock lock(frame_queue_mutex_); + frame_queue_finished_cv_.notify_all(); + return; + } else { + auto& output_node = it->second; + + if (output_node.array->write_frame(frame) != frame.size()) { + set_error_("Failed to write frame to writer for key: " + + output_key); + std::unique_lock lock(frame_queue_mutex_); + frame_queue_finished_cv_.notify_all(); + return; + } + } + } + + 
frame_queue_empty_cv_.notify_all(); // queue is now empty } std::unique_lock lock(frame_queue_mutex_); From 97fa8b4f83f755cfbeeaccb115846258374bb9ed Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 18:05:21 +0200 Subject: [PATCH 19/38] Remove LockedBuffer --- src/streaming/CMakeLists.txt | 2 - src/streaming/array.base.hh | 3 +- src/streaming/array.cpp | 8 +- src/streaming/array.hh | 5 +- src/streaming/downsampler.cpp | 166 +++++++++--------- src/streaming/downsampler.hh | 4 +- src/streaming/frame.queue.cpp | 4 +- src/streaming/frame.queue.hh | 7 +- src/streaming/locked.buffer.cpp | 130 -------------- src/streaming/locked.buffer.hh | 99 ----------- src/streaming/multiscale.array.cpp | 6 +- src/streaming/multiscale.array.hh | 4 +- src/streaming/zarr.stream.cpp | 16 +- src/streaming/zarr.stream.hh | 3 +- tests/unit-tests/array-write-even.cpp | 2 +- .../array-write-ragged-append-dim.cpp | 2 +- .../array-write-ragged-internal-dim.cpp | 2 +- tests/unit-tests/downsampler-odd-z.cpp | 41 ++--- tests/unit-tests/downsampler.cpp | 136 +++++++------- tests/unit-tests/frame-queue.cpp | 35 ++-- 20 files changed, 220 insertions(+), 455 deletions(-) delete mode 100644 src/streaming/locked.buffer.cpp delete mode 100644 src/streaming/locked.buffer.hh diff --git a/src/streaming/CMakeLists.txt b/src/streaming/CMakeLists.txt index c1c53251..dbbf858e 100644 --- a/src/streaming/CMakeLists.txt +++ b/src/streaming/CMakeLists.txt @@ -11,8 +11,6 @@ add_library(${tgt} acquire.zarr.cpp array.dimensions.hh array.dimensions.cpp - locked.buffer.hh - locked.buffer.cpp frame.queue.hh frame.queue.cpp downsampler.hh diff --git a/src/streaming/array.base.hh b/src/streaming/array.base.hh index 73bd2473..2a78ec55 100644 --- a/src/streaming/array.base.hh +++ b/src/streaming/array.base.hh @@ -2,7 +2,6 @@ #include "array.dimensions.hh" #include "blosc.compression.params.hh" -#include "locked.buffer.hh" #include "thread.pool.hh" #include "zarr.types.h" @@ -73,7 +72,7 @@ class ArrayBase * 
@param data The data to write. * @return The number of bytes successfully written. */ - [[nodiscard]] virtual size_t write_frame(LockedBuffer& data) = 0; + [[nodiscard]] virtual size_t write_frame(std::vector& data) = 0; protected: std::shared_ptr config_; diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index bee556ac..f1449569 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -101,7 +101,7 @@ zarr::Array::memory_usage() const noexcept } size_t -zarr::Array::write_frame(LockedBuffer& data) +zarr::Array::write_frame(std::vector& data) { const auto nbytes_data = data.size(); const auto nbytes_frame = @@ -341,7 +341,7 @@ zarr::Array::fill_buffers_() } size_t -zarr::Array::write_frame_to_chunks_(LockedBuffer& data) +zarr::Array::write_frame_to_chunks_(std::vector& data) { // break the frame into tiles and write them to the chunk buffers const auto bytes_per_px = bytes_of_type(config_->dtype); @@ -379,7 +379,7 @@ zarr::Array::write_frame_to_chunks_(LockedBuffer& data) size_t bytes_written = 0; const auto n_tiles = n_tiles_x * n_tiles_y; - auto frame = data.take(); + std::vector frame = std::move(data); #pragma omp parallel for reduction(+ : bytes_written) for (auto tile = 0; tile < n_tiles; ++tile) { @@ -429,7 +429,7 @@ zarr::Array::write_frame_to_chunks_(LockedBuffer& data) } } - data.assign(std::move(frame)); + data = std::move(frame); return bytes_written; } diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 6fa8d0b5..f6820674 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -2,7 +2,6 @@ #include "array.base.hh" #include "definitions.hh" -#include "locked.buffer.hh" #include "thread.pool.hh" namespace zarr { @@ -16,7 +15,7 @@ class Array : public ArrayBase size_t memory_usage() const noexcept override; - [[nodiscard]] size_t write_frame(LockedBuffer&) override; + [[nodiscard]] size_t write_frame(std::vector&) override; protected: struct ShardLayer @@ -47,7 +46,7 @@ class Array : public ArrayBase 
bool should_flush_() const; bool should_rollover_() const; - size_t write_frame_to_chunks_(LockedBuffer& data); + size_t write_frame_to_chunks_(std::vector& data); [[nodiscard]] ShardLayer collect_chunks_(uint32_t shard_index); [[nodiscard]] bool compress_and_flush_data_(); diff --git a/src/streaming/downsampler.cpp b/src/streaming/downsampler.cpp index d0441569..4d9698d9 100644 --- a/src/streaming/downsampler.cpp +++ b/src/streaming/downsampler.cpp @@ -303,111 +303,109 @@ zarr::Downsampler::Downsampler(std::shared_ptr config, } void -zarr::Downsampler::add_frame(LockedBuffer& frame) +zarr::Downsampler::add_frame(std::vector& frame) { const auto& base_dims = writer_configurations_[0]->dimensions; size_t frame_width = base_dims->width_dim().array_size_px; size_t frame_height = base_dims->height_dim().array_size_px; - frame.with_lock([&](const auto& data) { - ByteVector current_frame(data.begin(), data.end()); - ByteVector next_level_frame; - - for (auto level = 1; level < n_levels_(); ++level) { - const auto& prev_dims = - writer_configurations_[level - 1]->dimensions; - const auto prev_width = prev_dims->width_dim().array_size_px; - const auto prev_height = prev_dims->height_dim().array_size_px; - const auto prev_planes = - prev_dims->at(prev_dims->ndims() - 3).array_size_px; - - EXPECT(prev_width == frame_width && prev_height == frame_height, - "Frame dimensions do not match expected dimensions: ", - prev_width, - "x", - prev_height, - " vs. 
", - frame_width, - "x", - frame_height); - - const auto& next_dims = writer_configurations_[level]->dimensions; - const auto next_width = next_dims->width_dim().array_size_px; - const auto next_height = next_dims->height_dim().array_size_px; - const auto next_planes = - next_dims->at(next_dims->ndims() - 3).array_size_px; - - // only downsample if this level's XY size is smaller than the last - if (next_width < prev_width || next_height < prev_height) { - next_level_frame = - scale_fun_(current_frame, frame_width, frame_height, method_); - } else { - next_level_frame.assign(current_frame.begin(), - current_frame.end()); - } + // frame.with_lock([&](const auto& data) { + ByteVector current_frame(frame.begin(), frame.end()); + ByteVector next_level_frame; + + for (auto level = 1; level < n_levels_(); ++level) { + const auto& prev_dims = writer_configurations_[level - 1]->dimensions; + const auto prev_width = prev_dims->width_dim().array_size_px; + const auto prev_height = prev_dims->height_dim().array_size_px; + const auto prev_planes = + prev_dims->at(prev_dims->ndims() - 3).array_size_px; + + EXPECT(prev_width == frame_width && prev_height == frame_height, + "Frame dimensions do not match expected dimensions: ", + prev_width, + "x", + prev_height, + " vs. 
", + frame_width, + "x", + frame_height); + + const auto& next_dims = writer_configurations_[level]->dimensions; + const auto next_width = next_dims->width_dim().array_size_px; + const auto next_height = next_dims->height_dim().array_size_px; + const auto next_planes = + next_dims->at(next_dims->ndims() - 3).array_size_px; + + // only downsample if this level's XY size is smaller than the last + if (next_width < prev_width || next_height < prev_height) { + next_level_frame = + scale_fun_(current_frame, frame_width, frame_height, method_); + } else { + next_level_frame.assign(current_frame.begin(), current_frame.end()); + } - EXPECT(next_width == frame_width && next_height == frame_height, - "Downsampled dimensions do not match expected dimensions: ", - next_width, - "x", - next_height, - " vs. ", - frame_width, - "x", - frame_height); - - // if the Z dimension is spatial, and has an odd number of planes, - // and this is the last plane, we don't want to queue it up to be - // averaged with the first frame of the next timepoint - bool average_this_frame = next_planes < prev_planes; - if (prev_planes % 2 != 0 && - level_frame_count_.at(level - 1) % prev_planes == 0) { - average_this_frame = false; - } + EXPECT(next_width == frame_width && next_height == frame_height, + "Downsampled dimensions do not match expected dimensions: ", + next_width, + "x", + next_height, + " vs. 
", + frame_width, + "x", + frame_height); + + // if the Z dimension is spatial, and has an odd number of planes, + // and this is the last plane, we don't want to queue it up to be + // averaged with the first frame of the next timepoint + bool average_this_frame = next_planes < prev_planes; + if (prev_planes % 2 != 0 && + level_frame_count_.at(level - 1) % prev_planes == 0) { + average_this_frame = false; + } - // only average if this level's Z size is smaller than the last - // and if we are not at the last frame of the previous level - if (average_this_frame) { - auto it = partial_scaled_frames_.find(level); - if (it != partial_scaled_frames_.end()) { - // average2_fun_ writes to next_level_frame - // swap here so that decimate2 can take it->second - next_level_frame.swap(it->second); - average2_fun_(next_level_frame, it->second, method_); - emplace_downsampled_frame_(level, next_level_frame); - - // clean up this LOD - partial_scaled_frames_.erase(it); - - // set up for next iteration - if (level + 1 < writer_configurations_.size()) { - current_frame.assign(next_level_frame.begin(), - next_level_frame.end()); - } - } else { - partial_scaled_frames_.emplace(level, next_level_frame); - break; - } - } else { - // no downsampling in Z, so we can just pass the data to the - // next level + // only average if this level's Z size is smaller than the last + // and if we are not at the last frame of the previous level + if (average_this_frame) { + auto it = partial_scaled_frames_.find(level); + if (it != partial_scaled_frames_.end()) { + // average2_fun_ writes to next_level_frame + // swap here so that decimate2 can take it->second + next_level_frame.swap(it->second); + average2_fun_(next_level_frame, it->second, method_); emplace_downsampled_frame_(level, next_level_frame); + // clean up this LOD + partial_scaled_frames_.erase(it); + + // set up for next iteration if (level + 1 < writer_configurations_.size()) { current_frame.assign(next_level_frame.begin(), 
next_level_frame.end()); } + } else { + partial_scaled_frames_.emplace(level, next_level_frame); + break; + } + } else { + // no downsampling in Z, so we can just pass the data to the + // next level + emplace_downsampled_frame_(level, next_level_frame); + + if (level + 1 < writer_configurations_.size()) { + current_frame.assign(next_level_frame.begin(), + next_level_frame.end()); } } - }); + } + // }); } bool -zarr::Downsampler::take_frame(int level, LockedBuffer& frame_data) +zarr::Downsampler::take_frame(int level, std::vector& frame_data) { auto it = downsampled_frames_.find(level); if (it != downsampled_frames_.end()) { - frame_data.assign(it->second); + frame_data.assign(it->second.begin(), it->second.end()); downsampled_frames_.erase(level); return true; } diff --git a/src/streaming/downsampler.hh b/src/streaming/downsampler.hh index a90d1e5c..db6ba10f 100644 --- a/src/streaming/downsampler.hh +++ b/src/streaming/downsampler.hh @@ -21,7 +21,7 @@ class Downsampler * level, by calling take_frame(). * @param frame The full-resolution frame data. */ - void add_frame(LockedBuffer& frame); + void add_frame(std::vector& frame); /** * @brief Get the downsampled frame for the given level, removing it from @@ -32,7 +32,7 @@ class Downsampler * @param[out] frame_data The downsampled frame data. * @return True if the downsampled frame was found, false otherwise. 
*/ - bool take_frame(int level, LockedBuffer& frame_data); + bool take_frame(int level, std::vector& frame_data); const std::unordered_map>& writer_configurations() const; diff --git a/src/streaming/frame.queue.cpp b/src/streaming/frame.queue.cpp index 06742e0a..81af22fd 100644 --- a/src/streaming/frame.queue.cpp +++ b/src/streaming/frame.queue.cpp @@ -19,7 +19,7 @@ zarr::FrameQueue::FrameQueue(size_t num_frames, size_t avg_frame_size) } bool -zarr::FrameQueue::push(LockedBuffer& frame, const std::string& key) +zarr::FrameQueue::push(std::vector& frame, const std::string& key) { std::unique_lock lock(mutex_); size_t write_pos = write_pos_.load(std::memory_order_relaxed); @@ -39,7 +39,7 @@ zarr::FrameQueue::push(LockedBuffer& frame, const std::string& key) } bool -zarr::FrameQueue::pop(LockedBuffer& frame, std::string& key) +zarr::FrameQueue::pop(std::vector& frame, std::string& key) { std::unique_lock lock(mutex_); size_t read_pos = read_pos_.load(std::memory_order_relaxed); diff --git a/src/streaming/frame.queue.hh b/src/streaming/frame.queue.hh index 07a02428..c725b225 100644 --- a/src/streaming/frame.queue.hh +++ b/src/streaming/frame.queue.hh @@ -1,7 +1,6 @@ #pragma once #include "definitions.hh" -#include "locked.buffer.hh" #include #include @@ -16,8 +15,8 @@ class FrameQueue explicit FrameQueue(size_t num_frames, size_t avg_frame_size); ~FrameQueue() = default; - bool push(LockedBuffer& frame, const std::string& key); - bool pop(LockedBuffer& frame, std::string& key); + bool push(std::vector& frame, const std::string& key); + bool pop(std::vector& frame, std::string& key); size_t size() const; size_t bytes_used() const; @@ -29,7 +28,7 @@ class FrameQueue struct Frame { std::string key; - LockedBuffer data; + std::vector data; std::atomic ready{ false }; }; diff --git a/src/streaming/locked.buffer.cpp b/src/streaming/locked.buffer.cpp deleted file mode 100644 index 59544775..00000000 --- a/src/streaming/locked.buffer.cpp +++ /dev/null @@ -1,130 +0,0 @@ 
-#include "locked.buffer.hh" -#include "macros.hh" - -#include - -zarr::LockedBuffer::LockedBuffer(std::vector&& data) - : data_(std::move(data)) -{ -} - -zarr::LockedBuffer::LockedBuffer(zarr::LockedBuffer&& other) noexcept - : data_(std::move(other.data_)) -{ -} - -zarr::LockedBuffer& -zarr::LockedBuffer::operator=(zarr::LockedBuffer&& other) noexcept -{ - if (this != &other) { - std::unique_lock lock1(mutex_, std::defer_lock); - std::unique_lock lock2(other.mutex_, std::defer_lock); - std::lock(lock1, lock2); // avoid deadlock - - data_ = std::move(other.data_); - } - - return *this; -} - -void -zarr::LockedBuffer::resize(size_t n) -{ - std::unique_lock lock(mutex_); - data_.resize(n); -} - -void -zarr::LockedBuffer::resize_and_fill(size_t n, uint8_t value) -{ - std::unique_lock lock(mutex_); - - data_.resize(n, value); - std::fill(data_.begin(), data_.end(), value); -} - -size_t -zarr::LockedBuffer::size() const -{ - std::unique_lock lock(mutex_); - return data_.size(); -} - -void -zarr::LockedBuffer::assign(ConstByteSpan data) -{ - std::unique_lock lock(mutex_); - data_.assign(data.begin(), data.end()); -} - -void -zarr::LockedBuffer::assign(ByteVector&& data) -{ - std::unique_lock lock(mutex_); - data_ = std::move(data); -} - -void -zarr::LockedBuffer::assign_at(size_t offset, ConstByteSpan data) -{ - std::unique_lock lock(mutex_); - if (offset + data.size() > data_.size()) { - data_.resize(offset + data.size()); - } - std::copy(data.begin(), data.end(), data_.begin() + offset); -} - -void -zarr::LockedBuffer::swap(zarr::LockedBuffer& other) -{ - std::unique_lock lock(mutex_); - other.with_lock([this](ByteVector& other_data) { data_.swap(other_data); }); -} - -void -zarr::LockedBuffer::clear() -{ - std::unique_lock lock(mutex_); - data_.clear(); -} - -std::vector -zarr::LockedBuffer::take() -{ - std::unique_lock lock(mutex_); - std::vector result = std::move(data_); - data_ = std::vector{}; // Fresh empty vector - return result; -} - -bool 
-zarr::LockedBuffer::compress(const zarr::BloscCompressionParams& params, - size_t type_size) -{ - std::unique_lock lock(mutex_); - if (data_.empty()) { - LOG_WARNING("Buffer is empty, not compressing."); - return false; - } - - std::vector compressed_data(data_.size() + BLOSC_MAX_OVERHEAD); - const auto n_bytes_compressed = blosc_compress_ctx(params.clevel, - params.shuffle, - type_size, - data_.size(), - data_.data(), - compressed_data.data(), - compressed_data.size(), - params.codec_id.c_str(), - 0, - 1); - - if (n_bytes_compressed <= 0) { - LOG_ERROR("blosc_compress_ctx failed with code ", n_bytes_compressed); - return false; - } - - compressed_data.resize(n_bytes_compressed); - data_ = compressed_data; - return true; -} \ No newline at end of file diff --git a/src/streaming/locked.buffer.hh b/src/streaming/locked.buffer.hh deleted file mode 100644 index 20f343dd..00000000 --- a/src/streaming/locked.buffer.hh +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include "blosc.compression.params.hh" -#include "definitions.hh" - -#include -#include - -namespace zarr { -class LockedBuffer -{ - private: - mutable std::mutex mutex_; - std::vector data_; - - public: - LockedBuffer() = default; - LockedBuffer(std::vector&& data); - - LockedBuffer(const LockedBuffer& other) = delete; - LockedBuffer(LockedBuffer&& other) noexcept; - - LockedBuffer& operator=(const LockedBuffer&) = delete; - LockedBuffer& operator=(LockedBuffer&& other) noexcept; - - template - auto with_lock(F&& fun) -> decltype(fun(data_)) - { - std::unique_lock lock(mutex_); - return fun(data_); - } - - /** - * @brief Resize the buffer to @p n bytes, but keep existing data. - * @param n New size of the buffer. - */ - void resize(size_t n); - - /** - * @brief Resize the buffer to @p n bytes, filling new bytes with @p value. - * @param n New size of the buffer. - * @param value Value to fill new bytes with. 
- */ - void resize_and_fill(size_t n, uint8_t value); - - /** - * @brief Get the current size of the buffer. - * @return Size of the buffer in bytes. - */ - size_t size() const; - - /** - * @brief Assign new data to the buffer, replacing existing data. - * @param data Data to assign to the buffer. - */ - void assign(ConstByteSpan data); - - /** - * @brief Assign new data to the buffer, replacing existing data. - * @note Moves the data - * @param data Data to assign to the buffer. - */ - void assign(ByteVector&& data); - - /** - * @brief Assign new data to the buffer at offset @p offset, replacing - * existing data. - * @param offset - * @param data - */ - void assign_at(size_t offset, ConstByteSpan data); - - /** - * @brief Swap the contents of this buffer with another. - * @param other The other LockedBuffer to swap with. - */ - void swap(LockedBuffer& other); - - /** - * @brief Clear the buffer, removing all data. - */ - void clear(); - - /** - * @brief Take the contents of the buffer, leaving it empty. - * @return The contents of the buffer. - */ - std::vector take(); - - /** - * @brief Compress the buffer in place using Blosc with the given parameters. - * @param params Compression parameters. - * @param type_size Size of the data type being compressed (e.g., 1 for uint8, 2 for uint16). - * @return true if compression was successful, false otherwise. 
- */ - [[nodiscard]] bool compress(const zarr::BloscCompressionParams& params, - size_t type_size); -}; -} // namespace zarr \ No newline at end of file diff --git a/src/streaming/multiscale.array.cpp b/src/streaming/multiscale.array.cpp index 977d08d6..2afeadaf 100644 --- a/src/streaming/multiscale.array.cpp +++ b/src/streaming/multiscale.array.cpp @@ -44,7 +44,7 @@ zarr::MultiscaleArray::memory_usage() const noexcept } size_t -zarr::MultiscaleArray::write_frame(LockedBuffer& data) +zarr::MultiscaleArray::write_frame(std::vector& data) { if (arrays_.empty()) { LOG_WARNING("Attempt to write to group with no arrays"); @@ -233,7 +233,7 @@ zarr::MultiscaleArray::make_base_array_config_() const } void -zarr::MultiscaleArray::write_multiscale_frames_(LockedBuffer& data) +zarr::MultiscaleArray::write_multiscale_frames_(std::vector& data) { if (!downsampler_) { return; // no downsampler, nothing to do @@ -242,7 +242,7 @@ zarr::MultiscaleArray::write_multiscale_frames_(LockedBuffer& data) downsampler_->add_frame(data); for (auto i = 1; i < arrays_.size(); ++i) { - LockedBuffer downsampled_frame; + std::vector downsampled_frame; if (downsampler_->take_frame(i, downsampled_frame)) { const auto n_bytes = arrays_[i]->write_frame(downsampled_frame); EXPECT(n_bytes == downsampled_frame.size(), diff --git a/src/streaming/multiscale.array.hh b/src/streaming/multiscale.array.hh index a019a90a..0afe6ed0 100644 --- a/src/streaming/multiscale.array.hh +++ b/src/streaming/multiscale.array.hh @@ -25,7 +25,7 @@ class MultiscaleArray : public ArrayBase * @param data The frame data to write. * @return The number of bytes written of the full-resolution frame. */ - [[nodiscard]] size_t write_frame(LockedBuffer& data) override; + [[nodiscard]] size_t write_frame(std::vector& data) override; protected: std::unique_ptr downsampler_; @@ -63,6 +63,6 @@ class MultiscaleArray : public ArrayBase * resolution arrays. * @param data The frame data to write. 
*/ - void write_multiscale_frames_(LockedBuffer& data); + void write_multiscale_frames_(std::vector& data); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/zarr.stream.cpp b/src/streaming/zarr.stream.cpp index 9a7c7216..04c977ea 100644 --- a/src/streaming/zarr.stream.cpp +++ b/src/streaming/zarr.stream.cpp @@ -866,8 +866,9 @@ ZarrStream::append(const char* key_, const void* data_, size_t nbytes) const size_t bytes_to_copy = std::min(bytes_of_frame - frame_buffer_offset, bytes_remaining); - frame_buffer.assign_at(frame_buffer_offset, - { data + bytes_written, bytes_to_copy }); + memcpy(frame_buffer.data() + frame_buffer_offset, + data + bytes_written, + bytes_to_copy); frame_buffer_offset += bytes_to_copy; bytes_written += bytes_to_copy; @@ -890,12 +891,12 @@ ZarrStream::append(const char* key_, const void* data_, size_t nbytes) frame_buffer_offset = 0; } } else if (bytes_remaining < bytes_of_frame) { // begin partial frame - frame_buffer.assign_at(0, { data, bytes_remaining }); + memcpy(frame_buffer.data(), data, bytes_remaining); frame_buffer_offset = bytes_remaining; bytes_written += bytes_remaining; } else { // at least one full frame - zarr::LockedBuffer frame; - frame.assign({ data, bytes_of_frame }); + std::vector frame(bytes_of_frame); + frame.assign(data, data + bytes_of_frame); std::unique_lock lock(frame_queue_mutex_); while (!frame_queue_->push(frame, key) && process_frames_) { @@ -1178,7 +1179,8 @@ ZarrStream_s::configure_array_(const ZarrArraySettings* settings, dims->height_dim().array_size_px * zarr::bytes_of_type(settings->data_type); - output_node.frame_buffer.resize_and_fill(frame_size_bytes, 0); + output_node.frame_buffer.resize(frame_size_bytes); + std::ranges::fill(output_node.frame_buffer, 0); output_arrays_.emplace(output_node.output_key, std::move(output_node)); return true; @@ -1530,7 +1532,7 @@ ZarrStream_s::process_frame_queue_() std::string output_key; - zarr::LockedBuffer frame; + std::vector frame; while 
(process_frames_) { { std::unique_lock lock(frame_queue_mutex_); diff --git a/src/streaming/zarr.stream.hh b/src/streaming/zarr.stream.hh index 59fd0186..66e153c5 100644 --- a/src/streaming/zarr.stream.hh +++ b/src/streaming/zarr.stream.hh @@ -4,7 +4,6 @@ #include "array.dimensions.hh" #include "file.handle.hh" #include "frame.queue.hh" -#include "locked.buffer.hh" #include "plate.hh" #include "s3.connection.hh" #include "thread.pool.hh" @@ -50,7 +49,7 @@ struct ZarrStream_s struct ZarrOutputArray { std::string output_key; - zarr::LockedBuffer frame_buffer; + std::vector frame_buffer; size_t frame_buffer_offset; std::unique_ptr array; }; diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index fd138d98..1b072380 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -148,7 +148,7 @@ main() config, thread_pool, std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); + std::vector data(frame_size, 0); for (auto i = 0; i < n_frames; ++i) { // 2 time points CHECK(writer->write_frame(data)); diff --git a/tests/unit-tests/array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp index 09007a64..0ac55ac2 100644 --- a/tests/unit-tests/array-write-ragged-append-dim.cpp +++ b/tests/unit-tests/array-write-ragged-append-dim.cpp @@ -118,7 +118,7 @@ main() config, thread_pool, std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); + std::vector data(frame_size, 0); for (auto i = 0; i < n_frames; ++i) { // 2 time points CHECK(writer->write_frame(data)); diff --git a/tests/unit-tests/array-write-ragged-internal-dim.cpp b/tests/unit-tests/array-write-ragged-internal-dim.cpp index 805aaccf..c0cee78c 100644 --- a/tests/unit-tests/array-write-ragged-internal-dim.cpp +++ 
b/tests/unit-tests/array-write-ragged-internal-dim.cpp @@ -137,7 +137,7 @@ main() std::make_shared()); const size_t frame_size = array_width * array_height * nbytes_px; - zarr::LockedBuffer data(std::move(ByteVector(frame_size, 0))); + std::vector data(frame_size, 0); for (auto i = 0; i < n_frames; ++i) { // 2 time points CHECK(writer->write_frame(data)); diff --git a/tests/unit-tests/downsampler-odd-z.cpp b/tests/unit-tests/downsampler-odd-z.cpp index 588bed79..25414a01 100644 --- a/tests/unit-tests/downsampler-odd-z.cpp +++ b/tests/unit-tests/downsampler-odd-z.cpp @@ -3,7 +3,7 @@ namespace { template -zarr::LockedBuffer +std::vector create_test_image(size_t width, size_t height, T value = 100) { ByteVector data(width * height * sizeof(T), 0); @@ -13,7 +13,7 @@ create_test_image(size_t width, size_t height, T value = 100) typed_data[i] = value; } - return { std::move(data) }; + return data; } void @@ -25,14 +25,14 @@ check_downsample(zarr::Downsampler& downsampler, uint8_t frame_value) for (auto i = 0; i < 15; ++i) { downsampler.add_frame(first_timepoint); if (i % 2 == 1) { - zarr::LockedBuffer downsampled; + std::vector downsampled; EXPECT(downsampler.take_frame(1, downsampled), "Downsampled frame not found"); ++n_downsampled; - downsampled.with_lock([frame_value](const ByteVector& data) { - for (auto j = 0; j < data.size(); ++j) { - auto value = data[j]; + { + for (auto j = 0; j < downsampled.size(); ++j) { + auto value = downsampled[j]; EXPECT(value == frame_value, "Downsampled value mismatch at timepoint ", j, @@ -41,20 +41,20 @@ check_downsample(zarr::Downsampler& downsampler, uint8_t frame_value) ", got ", value); } - }); + } } } EXPECT( n_downsampled == 7, "Expected 7 downsampled frames, got ", n_downsampled); - zarr::LockedBuffer downsampled; + std::vector downsampled; EXPECT(downsampler.take_frame(1, downsampled), "Downsampled frame not found after all frames added"); - downsampled.with_lock([frame_value](const ByteVector& data) { - for (auto j = 0; j < 
data.size(); ++j) { - auto value = data[j]; + { + for (auto j = 0; j < downsampled.size(); ++j) { + auto value = downsampled[j]; EXPECT(value == frame_value, "Downsampled value mismatch at timepoint ", j, @@ -63,7 +63,7 @@ check_downsample(zarr::Downsampler& downsampler, uint8_t frame_value) ", got ", value); } - }); + } } } // namespace @@ -80,14 +80,15 @@ main() { "x", ZarrDimensionType_Space, 64, 16, 1 }, }, ZarrDataType_uint8); - auto config = std::make_shared("", - "/0", - std::nullopt, - std::nullopt, - dims, - ZarrDataType_uint8, - ZarrDownsamplingMethod_Mean, - 0); + auto config = + std::make_shared("", + "/0", + std::nullopt, + std::nullopt, + dims, + ZarrDataType_uint8, + ZarrDownsamplingMethod_Mean, + 0); try { zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); diff --git a/tests/unit-tests/downsampler.cpp b/tests/unit-tests/downsampler.cpp index caf0c3dd..7b89b4fa 100644 --- a/tests/unit-tests/downsampler.cpp +++ b/tests/unit-tests/downsampler.cpp @@ -9,7 +9,7 @@ namespace { // Helper to create simple test images template -zarr::LockedBuffer +std::vector create_test_image(size_t width, size_t height, T value = 100) { ByteVector data(width * height * sizeof(T), 0); @@ -55,7 +55,7 @@ test_basic_downsampling() // Add the frame and check that downsampled version is created downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Downsampled frame not found"); @@ -64,11 +64,9 @@ test_basic_downsampling() // Verify the downsampled values (should still be 100 since all input pixels // were 100) - downsampled.with_lock([](auto& data) { - for (size_t i = 0; i < 5 * 5; ++i) { - EXPECT_EQ(uint8_t, data[i], 100); - } - }); + for (size_t i = 0; i < 5 * 5; ++i) { + EXPECT_EQ(uint8_t, downsampled[i], 100); + } // Check frame is removed from cache after retrieval has_frame = downsampler.take_frame(1, downsampled); @@ -108,7 +106,7 @@ 
test_3d_downsampling() // Add first frame - should be stored in partial_scaled_frames_ downsampler.add_frame(image1); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(!has_frame, "Downsampled frame should not be ready yet in 3D mode"); @@ -120,12 +118,10 @@ test_3d_downsampling() EXPECT(has_frame, "Downsampled frame not found after second frame"); // Verify the values (should be average of 100 and 200 = 150) - downsampled.with_lock([](auto& data) { - auto* typed_downsampled = reinterpret_cast(data.data()); - for (size_t i = 0; i < 10 * 10; ++i) { - EXPECT_EQ(uint16_t, typed_downsampled[i], 150); - } - }); + auto* typed_downsampled = reinterpret_cast(downsampled.data()); + for (size_t i = 0; i < 10 * 10; ++i) { + EXPECT_EQ(uint16_t, typed_downsampled[i], 150); + } // second level shouldn't be ready yet has_frame = downsampler.take_frame(2, downsampled); @@ -149,12 +145,10 @@ test_3d_downsampling() EXPECT(has_frame, "Downsampled frame not found after fourth frame"); // Verify the values (should be average of 100, 200, 300, and 400 = 250) - downsampled.with_lock([](auto& data) { - auto* typed_downsampled = reinterpret_cast(data.data()); - for (size_t i = 0; i < 5 * 5; ++i) { - EXPECT_EQ(uint16_t, typed_downsampled[i], 250); - } - }); + typed_downsampled = reinterpret_cast(downsampled.data()); + for (size_t i = 0; i < 5 * 5; ++i) { + EXPECT_EQ(uint16_t, typed_downsampled[i], 250); + } } void @@ -191,7 +185,7 @@ test_data_types() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); // Add a frame based on the type - zarr::LockedBuffer image; + std::vector image; size_t pixel_size = 0; switch (type) { @@ -242,7 +236,7 @@ test_data_types() downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Downsampled frame not found for type " + @@ -439,10 +433,10 @@ test_edge_cases() 
zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); // Create a test image (11x11) - zarr::LockedBuffer image(std::move(ByteVector(11 * 11, 100))); + std::vector image(std::move(ByteVector(11 * 11, 100))); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Downsampled frame not found for odd dimensions"); @@ -472,10 +466,10 @@ test_min_max_downsampling() // Create a test image with a pattern that will show different results for // min/max/mean - zarr::LockedBuffer image( + std::vector image( std::move(ByteVector(10 * 10 * sizeof(uint8_t), 0))); - image.with_lock([](auto& data) { - auto* typed_data = reinterpret_cast(data.data()); + { + auto* typed_data = reinterpret_cast(image.data()); // Create a pattern where each 2x2 block has values [100, 200, 150, 250] for (size_t y = 0; y < 10; y += 2) { @@ -486,23 +480,23 @@ test_min_max_downsampling() typed_data[(y + 1) * 10 + (x + 1)] = 250; // bottom-right } } - }); + } // Test with mean downsampling { zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Mean downsampled frame not found"); - downsampled.with_lock([](auto& data) { + { // For mean, we expect (100 + 200 + 150 + 250) / 4 = 175 for (size_t i = 0; i < 5 * 5; ++i) { - EXPECT_EQ(uint8_t, data[i], 175); + EXPECT_EQ(uint8_t, downsampled[i], 175); } - }); + } } // Test with min downsampling @@ -511,16 +505,16 @@ test_min_max_downsampling() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Min); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Min downsampled frame not found"); - downsampled.with_lock([](auto& data) { + { // For 
min, we expect min(100, 200, 150, 250) = 100 for (size_t i = 0; i < 5 * 5; ++i) { - EXPECT_EQ(uint8_t, data[i], 100); + EXPECT_EQ(uint8_t, downsampled[i], 100); } - }); + } } // Test with max downsampling @@ -529,16 +523,16 @@ test_min_max_downsampling() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Max); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Max downsampled frame not found"); - downsampled.with_lock([](auto& data) { + { // For max, we expect max(100, 200, 150, 250) = 250 for (size_t i = 0; i < 5 * 5; ++i) { - EXPECT_EQ(uint8_t, data[i], 250); + EXPECT_EQ(uint8_t, downsampled[i], 250); } - }); + } } } @@ -575,17 +569,18 @@ test_3d_min_max_downsampling() downsampler.add_frame(image1); downsampler.add_frame(image2); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Min downsampled frame not found after second frame"); - downsampled.with_lock([](auto& data) { + { // Verify the values (should be min of 100 and 200 = 100) - auto* typed_downsampled = reinterpret_cast(data.data()); + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 10 * 10; ++i) { EXPECT_EQ(uint16_t, typed_downsampled[i], 100); } - }); + } } // Test with max downsampling @@ -600,17 +595,18 @@ test_3d_min_max_downsampling() downsampler.add_frame(image1); downsampler.add_frame(image2); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Max downsampled frame not found after second frame"); - downsampled.with_lock([](auto& data) { + { // Verify the values (should be max of 100 and 200 = 200) - auto* typed_downsampled = reinterpret_cast(data.data()); + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 10 * 10; ++i) { 
EXPECT_EQ(uint16_t, typed_downsampled[i], 200); } - }); + } } // Test multi-level downsampling with max @@ -627,17 +623,18 @@ test_3d_min_max_downsampling() downsampler.add_frame(image3); downsampler.add_frame(image4); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(2, downsampled); EXPECT(has_frame, "Level 2 max downsampled frame not found"); - downsampled.with_lock([](auto& data) { + { // Verify the values (should be max of all values = 400) - auto* typed_downsampled = reinterpret_cast(data.data()); + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 5 * 5; ++i) { EXPECT_EQ(uint16_t, typed_downsampled[i], 400); } - }); + } } } @@ -662,14 +659,14 @@ test_pattern_downsampling() 0); // Create a test image with a gradient pattern - zarr::LockedBuffer image( + std::vector image( std::move(ByteVector(8 * 8 * sizeof(uint16_t), 0))); std::vector expected_mean(4 * 4); std::vector expected_min(4 * 4); std::vector expected_max(4 * 4); - image.with_lock([&](auto& data) { - auto* typed_data = reinterpret_cast(data.data()); + { + auto* typed_data = reinterpret_cast(image.data()); // Values increase from left to right and top to bottom for (size_t y = 0; y < 8; ++y) { @@ -698,23 +695,24 @@ test_pattern_downsampling() std::max(std::max(v1, v2), std::max(v3, v4)); } } - }); + } // Test with mean downsampling { zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Mean downsampled frame not found"); - downsampled.with_lock([&expected_mean](auto& data) { - auto* typed_downsampled = reinterpret_cast(data.data()); + { + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 4 * 4; ++i) { EXPECT_EQ(uint16_t, typed_downsampled[i], expected_mean[i]); } - }); + } } // Test with min 
downsampling @@ -723,16 +721,17 @@ test_pattern_downsampling() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Min); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Min downsampled frame not found"); - downsampled.with_lock([&expected_min](auto& data) { - auto* typed_downsampled = reinterpret_cast(data.data()); + { + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 4 * 4; ++i) { EXPECT_EQ(uint16_t, typed_downsampled[i], expected_min[i]); } - }); + } } // Test with max downsampling @@ -741,16 +740,17 @@ test_pattern_downsampling() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Max); downsampler.add_frame(image); - zarr::LockedBuffer downsampled; + std::vector downsampled; bool has_frame = downsampler.take_frame(1, downsampled); EXPECT(has_frame, "Min downsampled frame not found"); - downsampled.with_lock([&expected_max](auto& data) { - auto* typed_downsampled = reinterpret_cast(data.data()); + { + auto* typed_downsampled = + reinterpret_cast(downsampled.data()); for (size_t i = 0; i < 4 * 4; ++i) { EXPECT_EQ(uint16_t, typed_downsampled[i], expected_max[i]); } - }); + } } } } // namespace zarr::test diff --git a/tests/unit-tests/frame-queue.cpp b/tests/unit-tests/frame-queue.cpp index e3d1240c..0faee446 100644 --- a/tests/unit-tests/frame-queue.cpp +++ b/tests/unit-tests/frame-queue.cpp @@ -20,7 +20,7 @@ test_basic_operations() for (size_t i = 0; i < data.size(); ++i) { data[i] = i % 256; } - zarr::LockedBuffer frame(std::move(data)); + std::vector frame(std::move(data)); // Pushing CHECK(queue.push(frame, "foo")); @@ -28,7 +28,7 @@ test_basic_operations() CHECK(!queue.empty()); // Popping - zarr::LockedBuffer received_frame; + std::vector received_frame; std::string received_key; CHECK(queue.pop(received_frame, received_key)); CHECK(received_frame.size() == 1024); @@ -36,11 +36,11 @@ 
test_basic_operations() CHECK(queue.empty()); // Verify data - received_frame.with_lock([](auto& data) { - for (size_t i = 0; i < data.size(); ++i) { - CHECK(data[i] == i % 256); + { + for (size_t i = 0; i < received_frame.size(); ++i) { + CHECK(received_frame[i] == i % 256); } - }); + } CHECK(received_key == "foo"); } @@ -52,19 +52,19 @@ test_capacity() // Fill the queue for (size_t i = 0; i < capacity; ++i) { - zarr::LockedBuffer frame(std::move(ByteVector(100, i))); + std::vector frame(std::move(ByteVector(100, i))); bool result = queue.push(frame, std::to_string(i)); CHECK(result); } // Queue should be full (next push should fail) - zarr::LockedBuffer extra_frame(std::move(ByteVector(100))); + std::vector extra_frame(std::move(ByteVector(100))); bool push_result = queue.push(extra_frame, std::to_string(capacity)); CHECK(!push_result); CHECK(queue.size() == capacity); // Remove one item - zarr::LockedBuffer received_frame; + std::vector received_frame; std::string received_key; bool pop_result = queue.pop(received_frame, received_key); CHECK(pop_result); @@ -72,7 +72,7 @@ test_capacity() CHECK(received_key == "0"); // Should be able to push again - zarr::LockedBuffer new_frame(std::move(ByteVector(100, 99))); + std::vector new_frame(std::move(ByteVector(100, 99))); push_result = queue.push(new_frame, std::to_string(capacity)); CHECK(push_result); CHECK(queue.size() == capacity); @@ -91,7 +91,7 @@ test_producer_consumer() // Producer thread std::thread producer([&queue, n_frames, frame_size]() { for (size_t i = 0; i < n_frames; ++i) { - zarr::LockedBuffer frame( + std::vector frame( std::move(ByteVector(frame_size, i % 256))); // Try until successful @@ -106,15 +106,13 @@ test_producer_consumer() size_t frames_received = 0; while (frames_received < n_frames) { - zarr::LockedBuffer frame; + std::vector frame; std::string received_key; if (queue.pop(frame, received_key)) { // Verify frame data (first byte should match frame number % // 256) CHECK(frame.size() > 0); 
- CHECK(frame.with_lock([&frames_received](auto& data) { - return data[0] == (frames_received % 256); - })); + CHECK(frame[0] == frames_received % 256); CHECK(received_key == "spam"); frames_received++; } else { @@ -142,20 +140,21 @@ test_throughput() // Create large frame for testing std::vector large_frame(frame_size, 42); - zarr::LockedBuffer data(std::move(ByteVector(large_frame))); + std::vector data(std::move(ByteVector(large_frame))); auto start_time = std::chrono::high_resolution_clock::now(); // Push and pop in a loop const size_t iterations = 100; - zarr::LockedBuffer received_frame; + std::vector received_frame; std::string received_key; for (size_t i = 0; i < iterations; ++i) { CHECK(queue.push(data, std::to_string(i))); CHECK(queue.pop(received_frame, received_key)); CHECK(received_frame.size() == frame_size); CHECK(received_key == std::to_string(i)); - data.assign(ByteVector(frame_size, 42)); // Reuse the buffer + data.resize(frame_size); + std::ranges::fill(data, 42); // reset data for next push } auto end_time = std::chrono::high_resolution_clock::now(); From 2c89501f1a0a8e271f031bed1679ce7d4fc2f299 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 18:09:16 +0200 Subject: [PATCH 20/38] Add overwrite flag to Python benchmark --- benchmarks/benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index d72be017..a71bf927 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -141,7 +141,8 @@ def run_acquire_zarr_test( ], data_type=aqz.DataType.UINT16, ) - ] + ], + overwrite=True, ) # Create a ZarrStream for appending frames. 
From 3dee8c63b350d3e3a3163fe0f9fb6c3f0a295e28 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 18:49:37 +0200 Subject: [PATCH 21/38] Be consistent in vector/span defs --- src/streaming/array.hh | 1 - src/streaming/definitions.hh | 10 ---------- src/streaming/downsampler.cpp | 20 ++++++++++---------- src/streaming/downsampler.hh | 17 +++++++++-------- src/streaming/file.handle.cpp | 1 - src/streaming/frame.queue.hh | 2 -- src/streaming/fs.array.cpp | 3 ++- src/streaming/fs.storage.cpp | 2 +- src/streaming/posix/platform.cpp | 4 ++-- src/streaming/s3.object.cpp | 2 +- src/streaming/s3.object.hh | 3 +-- src/streaming/win32/platform.cpp | 7 ++++--- src/streaming/zarr.common.cpp | 2 +- src/streaming/zarr.common.hh | 3 +-- tests/unit-tests/downsampler-odd-z.cpp | 2 +- tests/unit-tests/downsampler.cpp | 8 ++++---- tests/unit-tests/frame-queue.cpp | 12 ++++++------ 17 files changed, 43 insertions(+), 56 deletions(-) delete mode 100644 src/streaming/definitions.hh diff --git a/src/streaming/array.hh b/src/streaming/array.hh index f6820674..db12df52 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -1,7 +1,6 @@ #pragma once #include "array.base.hh" -#include "definitions.hh" #include "thread.pool.hh" namespace zarr { diff --git a/src/streaming/definitions.hh b/src/streaming/definitions.hh deleted file mode 100644 index c9b7ed1e..00000000 --- a/src/streaming/definitions.hh +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include // uint8_t -#include -#include - -using ByteVector = std::vector; - -using ByteSpan = std::span; -using ConstByteSpan = std::span; diff --git a/src/streaming/downsampler.cpp b/src/streaming/downsampler.cpp index 4d9698d9..8150c567 100644 --- a/src/streaming/downsampler.cpp +++ b/src/streaming/downsampler.cpp @@ -136,8 +136,8 @@ max2(const T& a, const T& b) } template -[[nodiscard]] ByteVector -scale_image(ConstByteSpan src, +[[nodiscard]] std::vector +scale_image(std::span src, size_t& width, size_t& height, 
ZarrDownsamplingMethod method) @@ -177,7 +177,7 @@ scale_image(ConstByteSpan src, const auto h_pad = height + (height % downscale); const auto size_downscaled = w_pad * h_pad * bytes_of_type / factor; - ByteVector dst(size_downscaled, 0); + std::vector dst(size_downscaled, 0); auto* dst_as_T = reinterpret_cast(dst.data()); auto* src_as_T = reinterpret_cast(src.data()); @@ -206,8 +206,8 @@ scale_image(ConstByteSpan src, template void -average_two_frames(ByteVector& dst, - ConstByteSpan src, +average_two_frames(std::vector& dst, + std::span src, ZarrDownsamplingMethod method) { T (*average_fun)(const T&, const T&) = nullptr; @@ -310,8 +310,8 @@ zarr::Downsampler::add_frame(std::vector& frame) size_t frame_height = base_dims->height_dim().array_size_px; // frame.with_lock([&](const auto& data) { - ByteVector current_frame(frame.begin(), frame.end()); - ByteVector next_level_frame; + std::vector current_frame(frame.begin(), frame.end()); + std::vector next_level_frame; for (auto level = 1; level < n_levels_(); ++level) { const auto& prev_dims = writer_configurations_[level - 1]->dimensions; @@ -397,7 +397,6 @@ zarr::Downsampler::add_frame(std::vector& frame) } } } - // }); } bool @@ -593,8 +592,9 @@ zarr::Downsampler::make_writer_configurations_( } void -zarr::Downsampler::emplace_downsampled_frame_(int level, - const ByteVector& frame_data) +zarr::Downsampler::emplace_downsampled_frame_( + int level, + const std::vector& frame_data) { downsampled_frames_.emplace(level, frame_data); ++level_frame_count_.at(level); diff --git a/src/streaming/downsampler.hh b/src/streaming/downsampler.hh index db6ba10f..d43b0947 100644 --- a/src/streaming/downsampler.hh +++ b/src/streaming/downsampler.hh @@ -2,7 +2,6 @@ #include "array.hh" #include "array.dimensions.hh" -#include "definitions.hh" #include "nlohmann/json.hpp" @@ -41,10 +40,12 @@ class Downsampler nlohmann::json get_metadata() const; private: - using ScaleFunT = std::function< - ByteVector(ConstByteSpan, size_t&, size_t&, 
ZarrDownsamplingMethod)>; - using Average2FunT = - std::function; + using ScaleFunT = std::function(std::span, + size_t&, + size_t&, + ZarrDownsamplingMethod)>; + using Average2FunT = std::function< + void(std::vector&, std::span, ZarrDownsamplingMethod)>; ZarrDownsamplingMethod method_; @@ -53,13 +54,13 @@ class Downsampler std::unordered_map> writer_configurations_; - std::unordered_map downsampled_frames_; - std::unordered_map partial_scaled_frames_; + std::unordered_map> downsampled_frames_; + std::unordered_map> partial_scaled_frames_; std::unordered_map level_frame_count_; size_t n_levels_() const; void make_writer_configurations_(std::shared_ptr config); - void emplace_downsampled_frame_(int level, const ByteVector& frame_data); + void emplace_downsampled_frame_(int level, const std::vector& frame_data); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/file.handle.cpp b/src/streaming/file.handle.cpp index 8ddcbd08..0f7eab03 100644 --- a/src/streaming/file.handle.cpp +++ b/src/streaming/file.handle.cpp @@ -1,4 +1,3 @@ -#include "definitions.hh" #include "file.handle.hh" #include "macros.hh" diff --git a/src/streaming/frame.queue.hh b/src/streaming/frame.queue.hh index c725b225..c17370ed 100644 --- a/src/streaming/frame.queue.hh +++ b/src/streaming/frame.queue.hh @@ -1,7 +1,5 @@ #pragma once -#include "definitions.hh" - #include #include #include diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index f343059e..5715545e 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -6,6 +6,7 @@ #include // memcp #include #include +#include #include void* @@ -15,7 +16,7 @@ void destroy_flags(void* flags); bool -seek_and_write(void* handle, size_t offset, ConstByteSpan data); +seek_and_write(void* handle, size_t offset, std::span data); namespace fs = std::filesystem; diff --git a/src/streaming/fs.storage.cpp b/src/streaming/fs.storage.cpp index 3e58245a..c229d30e 100644 --- 
a/src/streaming/fs.storage.cpp +++ b/src/streaming/fs.storage.cpp @@ -10,7 +10,7 @@ void destroy_flags(void* flags); bool -seek_and_write(void* handle, size_t offset, ConstByteSpan data); +seek_and_write(void* handle, size_t offset, std::span data); zarr::FSStorage::FSStorage(std::shared_ptr file_handle_pool) : file_handle_pool_(file_handle_pool) diff --git a/src/streaming/posix/platform.cpp b/src/streaming/posix/platform.cpp index cfe9bca5..640279be 100644 --- a/src/streaming/posix/platform.cpp +++ b/src/streaming/posix/platform.cpp @@ -1,6 +1,6 @@ -#include "definitions.hh" #include "macros.hh" +#include #include #include @@ -57,7 +57,7 @@ init_handle(const std::string& filename, void* flags) } bool -seek_and_write(void* handle, size_t offset, ConstByteSpan data) +seek_and_write(void* handle, size_t offset, std::span data) { CHECK(handle); const auto* fd = static_cast(handle); diff --git a/src/streaming/s3.object.cpp b/src/streaming/s3.object.cpp index 6ffd212c..2be63498 100644 --- a/src/streaming/s3.object.cpp +++ b/src/streaming/s3.object.cpp @@ -21,7 +21,7 @@ zarr::S3Object::S3Object(std::string_view bucket_name, } bool -zarr::S3Object::write(ConstByteSpan data, size_t offset) +zarr::S3Object::write(std::span data, size_t offset) { if (is_closed_) { LOG_ERROR("Cannot write to closed stream"); diff --git a/src/streaming/s3.object.hh b/src/streaming/s3.object.hh index e804aaed..38b34501 100644 --- a/src/streaming/s3.object.hh +++ b/src/streaming/s3.object.hh @@ -1,6 +1,5 @@ #pragma once -#include "definitions.hh" #include "s3.connection.hh" #include @@ -20,7 +19,7 @@ class S3Object * @param offset The offset to write at. * @return True if the write was successful, false otherwise. */ - [[nodiscard]] bool write(ConstByteSpan data, size_t offset); + [[nodiscard]] bool write(std::span data, size_t offset); /** * @brief Close the object, flushing any remaining data. 
diff --git a/src/streaming/win32/platform.cpp b/src/streaming/win32/platform.cpp index 1d033727..23de41e1 100644 --- a/src/streaming/win32/platform.cpp +++ b/src/streaming/win32/platform.cpp @@ -1,6 +1,6 @@ -#include "definitions.hh" #include "macros.hh" +#include #include #include @@ -77,7 +77,7 @@ init_handle(const std::string& filename, void* flags) } bool -seek_and_write(void* handle, size_t offset, ConstByteSpan data) +seek_and_write(void* handle, size_t offset, std::span data) { CHECK(handle); const auto* fd = static_cast(handle); @@ -121,7 +121,8 @@ bool flush_file(void* handle) { CHECK(handle); - if (const auto* fd = static_cast(handle); *fd != INVALID_HANDLE_VALUE) { + if (const auto* fd = static_cast(handle); + *fd != INVALID_HANDLE_VALUE) { return FlushFileBuffers(*fd); } return true; diff --git a/src/streaming/zarr.common.cpp b/src/streaming/zarr.common.cpp index e176db26..861ef7cc 100644 --- a/src/streaming/zarr.common.cpp +++ b/src/streaming/zarr.common.cpp @@ -102,7 +102,7 @@ zarr::shards_along_dimension(const ZarrDimension& dimension) } bool -zarr::compress_in_place(ByteVector& data, +zarr::compress_in_place(std::vector& data, const zarr::BloscCompressionParams& params, size_t type_size) { diff --git a/src/streaming/zarr.common.hh b/src/streaming/zarr.common.hh index 3f7b605e..af8d95c1 100644 --- a/src/streaming/zarr.common.hh +++ b/src/streaming/zarr.common.hh @@ -3,7 +3,6 @@ #include "acquire.zarr.h" #include "thread.pool.hh" #include "array.dimensions.hh" -#include "definitions.hh" #include "blosc.compression.params.hh" namespace zarr { @@ -82,7 +81,7 @@ shards_along_dimension(const ZarrDimension& dimension); * @return true if compression was successful, false otherwise. 
*/ bool -compress_in_place(ByteVector& data, +compress_in_place(std::vector& data, const BloscCompressionParams& params, size_t type_size); diff --git a/tests/unit-tests/downsampler-odd-z.cpp b/tests/unit-tests/downsampler-odd-z.cpp index 25414a01..7a28cf9c 100644 --- a/tests/unit-tests/downsampler-odd-z.cpp +++ b/tests/unit-tests/downsampler-odd-z.cpp @@ -6,7 +6,7 @@ template std::vector create_test_image(size_t width, size_t height, T value = 100) { - ByteVector data(width * height * sizeof(T), 0); + std::vector data(width * height * sizeof(T), 0); auto* typed_data = reinterpret_cast(data.data()); for (size_t i = 0; i < width * height; ++i) { diff --git a/tests/unit-tests/downsampler.cpp b/tests/unit-tests/downsampler.cpp index 7b89b4fa..20c47082 100644 --- a/tests/unit-tests/downsampler.cpp +++ b/tests/unit-tests/downsampler.cpp @@ -12,7 +12,7 @@ template std::vector create_test_image(size_t width, size_t height, T value = 100) { - ByteVector data(width * height * sizeof(T), 0); + std::vector data(width * height * sizeof(T), 0); auto* typed_data = reinterpret_cast(data.data()); for (size_t i = 0; i < width * height; ++i) { @@ -433,7 +433,7 @@ test_edge_cases() zarr::Downsampler downsampler(config, ZarrDownsamplingMethod_Mean); // Create a test image (11x11) - std::vector image(std::move(ByteVector(11 * 11, 100))); + std::vector image(std::move(std::vector(11 * 11, 100))); downsampler.add_frame(image); std::vector downsampled; @@ -467,7 +467,7 @@ test_min_max_downsampling() // Create a test image with a pattern that will show different results for // min/max/mean std::vector image( - std::move(ByteVector(10 * 10 * sizeof(uint8_t), 0))); + std::move(std::vector(10 * 10 * sizeof(uint8_t), 0))); { auto* typed_data = reinterpret_cast(image.data()); @@ -660,7 +660,7 @@ test_pattern_downsampling() // Create a test image with a gradient pattern std::vector image( - std::move(ByteVector(8 * 8 * sizeof(uint16_t), 0))); + std::move(std::vector(8 * 8 * sizeof(uint16_t), 
0))); std::vector expected_mean(4 * 4); std::vector expected_min(4 * 4); diff --git a/tests/unit-tests/frame-queue.cpp b/tests/unit-tests/frame-queue.cpp index 0faee446..6a40187f 100644 --- a/tests/unit-tests/frame-queue.cpp +++ b/tests/unit-tests/frame-queue.cpp @@ -16,7 +16,7 @@ test_basic_operations() CHECK(queue.empty()); CHECK(!queue.full()); - ByteVector data(1024); + std::vector data(1024); for (size_t i = 0; i < data.size(); ++i) { data[i] = i % 256; } @@ -52,13 +52,13 @@ test_capacity() // Fill the queue for (size_t i = 0; i < capacity; ++i) { - std::vector frame(std::move(ByteVector(100, i))); + std::vector frame(std::move(std::vector(100, i))); bool result = queue.push(frame, std::to_string(i)); CHECK(result); } // Queue should be full (next push should fail) - std::vector extra_frame(std::move(ByteVector(100))); + std::vector extra_frame(std::move(std::vector(100))); bool push_result = queue.push(extra_frame, std::to_string(capacity)); CHECK(!push_result); CHECK(queue.size() == capacity); @@ -72,7 +72,7 @@ test_capacity() CHECK(received_key == "0"); // Should be able to push again - std::vector new_frame(std::move(ByteVector(100, 99))); + std::vector new_frame(std::move(std::vector(100, 99))); push_result = queue.push(new_frame, std::to_string(capacity)); CHECK(push_result); CHECK(queue.size() == capacity); @@ -92,7 +92,7 @@ test_producer_consumer() std::thread producer([&queue, n_frames, frame_size]() { for (size_t i = 0; i < n_frames; ++i) { std::vector frame( - std::move(ByteVector(frame_size, i % 256))); + std::move(std::vector(frame_size, i % 256))); // Try until successful while (!queue.push(frame, "spam")) { @@ -140,7 +140,7 @@ test_throughput() // Create large frame for testing std::vector large_frame(frame_size, 42); - std::vector data(std::move(ByteVector(large_frame))); + std::vector data(std::move(std::vector(large_frame))); auto start_time = std::chrono::high_resolution_clock::now(); From fd7c9a8fbd667318d0b7d74935ddb1ac3a952e45 Mon Sep 17 
00:00:00 2001 From: Alan Liddell Date: Thu, 16 Oct 2025 20:50:06 +0200 Subject: [PATCH 22/38] Set env a different way in S3 Python tests --- .github/workflows/test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 63887d86..644636a1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -165,12 +165,12 @@ jobs: run: python -m pip install ".[testing]" - name: Test Python - run: | - echo "AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID" >>.env - echo "AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" >>.env - echo "ZARR_S3_ENDPOINT=$ZARR_S3_ENDPOINT" >>.env - echo "ZARR_S3_BUCKET_NAME=$ZARR_S3_BUCKET_NAME" >>.env - python -m pytest -s -k test_stream_data_to_s3 + env: + ZARR_S3_ENDPOINT: ${{ env.MINIO_URL }} + ZARR_S3_BUCKET_NAME: ${{ env.MINIO_BUCKET }} + AWS_ACCESS_KEY_ID: ${{ env.MINIO_ACCESS_KEY }} + AWS_SECRET_ACCESS_KEY: ${{ env.MINIO_SECRET_KEY }} + run: python -m pytest -s -k test_stream_data_to_s3 test-python: From 839a0f94a2a77a099e39f322f9d7a736e8459f66 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Fri, 17 Oct 2025 11:58:51 +0200 Subject: [PATCH 23/38] (wip) yet another Array refactor --- src/streaming/array.cpp | 155 ++------------------------------- src/streaming/array.hh | 58 +++++++++++-- src/streaming/fs.array.cpp | 12 +++ src/streaming/fs.array.hh | 8 +- src/streaming/s3.array.cpp | 171 ++++++++++++++++++++++++++++++++++--- src/streaming/s3.array.hh | 28 +++++- 6 files changed, 261 insertions(+), 171 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index f1449569..06d06a69 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -66,7 +66,7 @@ zarr::Array::Array(std::shared_ptr config, : ArrayBase(config, thread_pool) , bytes_to_flush_{ 0 } , frames_written_{ 0 } - , append_chunk_index_{ 0 } + , append_shard_index_{ 0 } , current_layer_{ 0 } , is_closing_{ false } { @@ -86,7 +86,7 @@ 
zarr::Array::Array(std::shared_ptr config, std::ranges::fill(table, std::numeric_limits::max()); } - data_root_ = node_path_() + "/c/" + std::to_string(append_chunk_index_); + data_root_ = node_path_() + "/c/" + std::to_string(append_shard_index_); } size_t @@ -224,7 +224,7 @@ zarr::Array::make_metadata_(std::string& metadata_str) crc32_codec, }); - configuration["index_location"] = "end"; + configuration["index_location"] = index_location_(); configuration["codecs"] = json::array({ codec }); if (config_->compression_params) { @@ -263,7 +263,7 @@ zarr::Array::close_() if (bytes_to_flush_ > 0) { CHECK(compress_and_flush_data_()); } else if (current_layer_ > 0) { - CHECK(flush_tables_()); + // CHECK(flush_tables_()); } close_io_streams_(); @@ -480,35 +480,6 @@ zarr::Array::collect_chunks_(uint32_t shard_index) return std::move(layer); } -bool -zarr::Array::compress_and_flush_data_() -{ - if (!compress_chunks_()) { - LOG_ERROR("Failed to compress chunk data"); - return false; - } - - update_table_entries_(); - - if (!flush_data_()) { - LOG_ERROR("Failed to flush chunk data"); - return false; - } - - if (is_closing_ || should_rollover_()) { // flush table - if (!flush_tables_()) { - LOG_ERROR("Failed to flush shard tables"); - return false; - } - current_layer_ = 0; - } else { - ++current_layer_; - CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); - } - - return true; -} - bool zarr::Array::should_flush_() const { @@ -529,126 +500,12 @@ zarr::Array::should_rollover_() const return frames_written_ % dims->frames_before_flush() == 0; } -bool -zarr::Array::compress_chunks_() -{ - if (!config_->compression_params) { - return true; // nothing to do - } - - std::atomic all_successful = 1; - - const auto& params = *config_->compression_params; - const size_t bytes_per_px = bytes_of_type(config_->dtype); - - const auto& dims = config_->dimensions; - - const uint32_t chunks_in_memory = chunk_buffers_.size(); - const uint32_t chunk_group_offset = 
current_layer_ * chunks_in_memory; - - std::vector> futures; - futures.reserve(chunks_in_memory); - - for (size_t i = 0; i < chunks_in_memory; ++i) { - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - const uint32_t chunk_idx = i + chunk_group_offset; - const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); - const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); - auto* shard_table = shard_tables_.data() + shard_idx; - - auto job = [&chunk_buffer = chunk_buffers_[i], - bytes_per_px, - ¶ms, - shard_table, - shard_idx, - chunk_idx, - internal_idx, - promise, - &all_successful](std::string& err) { - bool success = false; - - try { - std::vector compressed_data(chunk_buffer.size() + - BLOSC_MAX_OVERHEAD); - const auto n_bytes_compressed = - blosc_compress_ctx(params.clevel, - params.shuffle, - bytes_per_px, - chunk_buffer.size(), - chunk_buffer.data(), - compressed_data.data(), - compressed_data.size(), - params.codec_id.c_str(), - 0, - 1); - - if (n_bytes_compressed <= 0) { - err = "blosc_compress_ctx failed with code " + - std::to_string(n_bytes_compressed) + " for chunk " + - std::to_string(chunk_idx) + " (internal index " + - std::to_string(internal_idx) + " of shard " + - std::to_string(shard_idx) + ")"; - success = false; - } else { - compressed_data.resize(n_bytes_compressed); - chunk_buffer.swap(compressed_data); - - // update shard table with size - shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); - success = true; - } - } catch (const std::exception& exc) { - err = exc.what(); - } - - promise->set_value(); - - all_successful.fetch_and(static_cast(success)); - return success; - }; - - // one thread is reserved for processing the frame queue and runs - // the entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { - if (std::string err; !job(err)) { - LOG_ERROR(err); - } - } - } - - for (auto& future : futures) { - future.wait(); - } - - 
return static_cast(all_successful); -} - -void -zarr::Array::update_table_entries_() -{ - const uint32_t chunks_in_memory = chunk_buffers_.size(); - const uint32_t chunk_group_offset = current_layer_ * chunks_in_memory; - const auto& dims = config_->dimensions; - - for (auto i = 0; i < chunks_in_memory; ++i) { - const auto& chunk_buffer = chunk_buffers_[i]; - const uint32_t chunk_idx = i + chunk_group_offset; - const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); - const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); - auto& shard_table = shard_tables_[shard_idx]; - - shard_table[2 * internal_idx + 1] = chunk_buffer.size(); - } -} - void zarr::Array::rollover_() { LOG_DEBUG("Rolling over"); close_io_streams_(); - ++append_chunk_index_; - data_root_ = node_path_() + "/c/" + std::to_string(append_chunk_index_); + ++append_shard_index_; + data_root_ = node_path_() + "/c/" + std::to_string(append_shard_index_); } diff --git a/src/streaming/array.hh b/src/streaming/array.hh index db12df52..0afe524b 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -28,7 +28,7 @@ class Array : public ArrayBase uint64_t bytes_to_flush_; uint32_t frames_written_; - uint32_t append_chunk_index_; + uint32_t append_shard_index_; std::string data_root_; bool is_closing_; @@ -39,21 +39,67 @@ class Array : public ArrayBase bool make_metadata_(std::string& metadata) override; [[nodiscard]] bool close_() override; + /** + * @brief Construct the data paths for all shards in the array with the + * current append shard index. + */ void make_data_paths_(); + + /** + * @brief Fill the chunk buffers with empty data, resizing as needed. + */ void fill_buffers_(); + /** + * @brief Determine if we should flush the current chunk buffers to storage. + * @return True if we should flush, false otherwise. + */ bool should_flush_() const; + + /** + * @brief Determine if we should rollover to a new shard along the append + * dimension. 
+ * @return True if we should rollover, false otherwise. + */ bool should_rollover_() const; + /** + * @brief Write the given frame data into the chunk buffers. + * @param data The frame data. + * @return The number of bytes written. + */ size_t write_frame_to_chunks_(std::vector& data); + /** + * @brief Collect all chunks for the given shard index at the current layer. + * @param shard_index The shard index. + * @return The collected shard layer. + */ [[nodiscard]] ShardLayer collect_chunks_(uint32_t shard_index); - [[nodiscard]] bool compress_and_flush_data_(); - [[nodiscard]] bool compress_chunks_(); - void update_table_entries_(); - [[nodiscard]] virtual bool flush_data_() = 0; - [[nodiscard]] virtual bool flush_tables_() = 0; + + /** + * @brief Close all current shard files and prepare for writing to a new + * shard along the append dimension. + */ void rollover_(); + + /** + * @brief Return the location of the shard index for this array ("start" or + * "end"). + * @return The index location. + */ + virtual std::string index_location_() const = 0; + + /** + * @brief Compress and flush all data currently in the chunk buffers to the + * underlying storage. + * @return True on success, false on failure. + */ + [[nodiscard]] virtual bool compress_and_flush_data_() = 0; + + /** + * @brief Close all open IO streams associated with this array. 
+ */ virtual void close_io_streams_() = 0; friend class MultiscaleArray; diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index 5715545e..30ef1c5a 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -276,6 +276,12 @@ zarr::FSArray::flush_tables_() return true; } +bool +zarr::FSArray::compress_and_flush_data_() +{ + return false; +} + void zarr::FSArray::close_io_streams_() { @@ -301,3 +307,9 @@ zarr::FSArray::get_handle_(const std::string& path) handles_.emplace(path, handle); return handle; } + +std::string +zarr::FSArray::index_location_() const +{ + return "start"; +} diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 2322eba7..9c8208ec 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -18,11 +18,13 @@ class FSArray final std::unordered_map> handles_; bool write_metadata_() override; - - bool flush_data_() override; - bool flush_tables_() override; + std::string index_location_() const override; + bool compress_and_flush_data_() override; void close_io_streams_() override; + bool flush_data_(); + bool flush_tables_(); + std::shared_ptr get_handle_(const std::string& path); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index a13b10d7..24f9f04e 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -1,6 +1,8 @@ #include "macros.hh" #include "s3.array.hh" +#include "zarr.common.hh" +#include #include #include // memcpy @@ -36,6 +38,165 @@ zarr::S3Array::write_metadata_() return success; } +std::string +zarr::S3Array::index_location_() const +{ + return "end"; +} + +bool +zarr::S3Array::compress_and_flush_data_() +{ + if (!compress_chunks_()) { + LOG_ERROR("Failed to compress chunk data"); + return false; + } + + update_table_entries_(); + + if (!flush_data_()) { + LOG_ERROR("Failed to flush chunk data"); + return false; + } + + if (is_closing_ || should_rollover_()) { // flush table 
+ if (!flush_tables_()) { + LOG_ERROR("Failed to flush shard tables"); + return false; + } + current_layer_ = 0; + } else { + ++current_layer_; + CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); + } + + return true; +} + +void +zarr::S3Array::close_io_streams_() +{ + for (const auto& key : data_paths_) { + EXPECT(finalize_object(key), "Failed to finalize S3 object at ", key); + } + + data_paths_.clear(); +} + +bool +zarr::S3Array::compress_chunks_() +{ + if (!config_->compression_params) { + return true; // nothing to do + } + + std::atomic all_successful = 1; + + const auto& params = *config_->compression_params; + const size_t bytes_per_px = bytes_of_type(config_->dtype); + + const auto& dims = config_->dimensions; + + const uint32_t chunks_in_memory = chunk_buffers_.size(); + const uint32_t chunk_group_offset = current_layer_ * chunks_in_memory; + + std::vector> futures; + futures.reserve(chunks_in_memory); + + for (size_t i = 0; i < chunks_in_memory; ++i) { + auto promise = std::make_shared>(); + futures.emplace_back(promise->get_future()); + + const uint32_t chunk_idx = i + chunk_group_offset; + const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); + const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); + auto* shard_table = shard_tables_.data() + shard_idx; + + auto job = [&chunk_buffer = chunk_buffers_[i], + bytes_per_px, + ¶ms, + shard_table, + shard_idx, + chunk_idx, + internal_idx, + promise, + &all_successful](std::string& err) { + bool success = false; + + try { + std::vector compressed_data(chunk_buffer.size() + + BLOSC_MAX_OVERHEAD); + const auto n_bytes_compressed = + blosc_compress_ctx(params.clevel, + params.shuffle, + bytes_per_px, + chunk_buffer.size(), + chunk_buffer.data(), + compressed_data.data(), + compressed_data.size(), + params.codec_id.c_str(), + 0, + 1); + + if (n_bytes_compressed <= 0) { + err = "blosc_compress_ctx failed with code " + + std::to_string(n_bytes_compressed) + " for chunk 
" + + std::to_string(chunk_idx) + " (internal index " + + std::to_string(internal_idx) + " of shard " + + std::to_string(shard_idx) + ")"; + success = false; + } else { + compressed_data.resize(n_bytes_compressed); + chunk_buffer.swap(compressed_data); + + // update shard table with size + shard_table->at(2 * internal_idx + 1) = chunk_buffer.size(); + success = true; + } + } catch (const std::exception& exc) { + err = exc.what(); + } + + promise->set_value(); + + all_successful.fetch_and(static_cast(success)); + return success; + }; + + // one thread is reserved for processing the frame queue and runs + // the entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || !thread_pool_->push_job(job)) { + if (std::string err; !job(err)) { + LOG_ERROR(err); + } + } + } + + for (auto& future : futures) { + future.wait(); + } + + return static_cast(all_successful); +} + +void +zarr::S3Array::update_table_entries_() +{ + const uint32_t chunks_in_memory = chunk_buffers_.size(); + const uint32_t chunk_group_offset = current_layer_ * chunks_in_memory; + const auto& dims = config_->dimensions; + + for (auto i = 0; i < chunks_in_memory; ++i) { + const auto& chunk_buffer = chunk_buffers_[i]; + const uint32_t chunk_idx = i + chunk_group_offset; + const uint32_t shard_idx = dims->shard_index_for_chunk(chunk_idx); + const uint32_t internal_idx = dims->shard_internal_index(chunk_idx); + auto& shard_table = shard_tables_[shard_idx]; + + shard_table[2 * internal_idx + 1] = chunk_buffer.size(); + } +} + bool zarr::S3Array::flush_data_() { @@ -172,13 +333,3 @@ zarr::S3Array::flush_tables_() return true; } - -void -zarr::S3Array::close_io_streams_() -{ - for (const auto& key : data_paths_) { - EXPECT(finalize_object(key), "Failed to finalize S3 object at ", key); - } - - data_paths_.clear(); -} diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index 3a3b9bb9..d9e1da77 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -15,9 +15,31 @@ 
class S3Array final protected: bool write_metadata_() override; - - bool flush_data_() override; - bool flush_tables_() override; + std::string index_location_() const override; + bool compress_and_flush_data_() override; void close_io_streams_() override; + + /** + * @brief Compress all the chunk buffers in place. + * @return True on success, false on failure. + */ + bool compress_chunks_(); + + /** + * @brief Update the shard tables with the sizes of the compressed chunks. + */ + void update_table_entries_(); + + /** + * @brief Flush the chunk data to S3 or intermediate buffers. + * @return True on success, false on failure. + */ + bool flush_data_(); + + /** + * @brief Flush the shard tables to S3 or intermediate buffers. + * @return True on success, false on failure. + */ + bool flush_tables_(); }; } // namespace zarr \ No newline at end of file From ae181357c960e939a3f55d3397c1b509a9a4f911 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Fri, 17 Oct 2025 13:52:00 +0200 Subject: [PATCH 24/38] (wip) reinstate try-catch in array-write-even.cpp --- tests/unit-tests/array-write-even.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index 1b072380..814e30f9 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -104,7 +104,7 @@ main() const ZarrDataType dtype = ZarrDataType_uint16; const unsigned int nbytes_px = zarr::bytes_of_type(dtype); - // try { + try { auto thread_pool = std::make_shared( 0, [](const std::string& err) { LOG_ERROR("Error: ", err); }); @@ -210,9 +210,9 @@ main() CHECK(!fs::is_directory(data_root / "c" / std::to_string(shards_in_t))); retval = 0; - // } catch (const std::exception& exc) { - // LOG_ERROR("Exception: ", exc.what()); - // } + } catch (const std::exception& exc) { + LOG_ERROR("Exception: ", exc.what()); + } // cleanup if (fs::exists(base_dir)) { From 
e33e60dbb5c5d34d00d2ff3bf46f6238fe14f8a2 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Tue, 21 Oct 2025 10:27:28 -0400 Subject: [PATCH 25/38] (wip) don't block FSArray::write until closing the shard (1/n) --- src/streaming/array.hh | 1 - src/streaming/fs.array.cpp | 315 ++++++++++++++++++++++++++----------- src/streaming/fs.array.hh | 11 ++ src/streaming/s3.array.hh | 2 + 4 files changed, 237 insertions(+), 92 deletions(-) diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 0afe524b..b0c4c9a2 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -33,7 +33,6 @@ class Array : public ArrayBase bool is_closing_; uint32_t current_layer_; - std::vector shard_file_offsets_; std::vector> shard_tables_; bool make_metadata_(std::string& metadata) override; diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index 30ef1c5a..c6b4c999 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -1,6 +1,8 @@ #include "fs.array.hh" #include "macros.hh" +#include "zarr.common.hh" +#include #include #include // memcp @@ -98,6 +100,9 @@ zarr::FSArray::FSArray(std::shared_ptr config, : Array(config, thread_pool) , FSStorage(file_handle_pool) { + table_size_ = + config_->dimensions->chunks_per_shard() * 2 * sizeof(uint64_t) + 4; + std::ranges::fill(shard_file_offsets_, table_size_); } bool @@ -125,92 +130,80 @@ zarr::FSArray::write_metadata_() bool zarr::FSArray::flush_data_() { - // construct paths to shard sinks if they don't already exist - if (data_paths_.empty()) { - make_data_paths_(); - } - - // create parent directories if needed - const auto parent_paths = get_parent_paths(data_paths_); - CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist - - const auto& dims = config_->dimensions; - - const auto n_shards = dims->number_of_shards(); - CHECK(data_paths_.size() == n_shards); - - std::atomic all_successful = 1; - - std::vector> futures; - - // wait for the chunks in each shard to finish 
compressing, then defragment - // and write the shard - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const std::string data_path = data_paths_[shard_idx]; - auto* file_offset = shard_file_offsets_.data() + shard_idx; - - const auto shard_data = collect_chunks_(shard_idx); - if (shard_data.chunks.empty()) { - LOG_ERROR("Failed to collect chunks for shard ", shard_idx); - return false; - } - if (shard_data.offset != *file_offset) { - LOG_ERROR("Inconsistent file offset for shard ", - shard_idx, - ": expected ", - *file_offset, - ", got ", - shard_data.offset); - return false; - } - - size_t layer_offset = shard_data.offset; - - for (auto& chunk : shard_data.chunks) { - auto promise = std::make_shared>(); - futures.emplace_back(promise->get_future()); - - const auto handle = get_handle_(data_path); - if (handle == nullptr) { - LOG_ERROR("Failed to get file handle for ", data_path); - return false; - } - - const auto chunk_size = chunk.size(); // we move it below - auto job = [data_path, - handle, - layer_offset, - chunk = std::move(chunk), - promise](std::string& err) { - bool success; - try { - success = seek_and_write(handle.get(), layer_offset, chunk); - } catch (const std::exception& exc) { - err = "Failed to write chunk at offset " + - std::to_string(layer_offset) + " to path " + - data_path + ": " + exc.what(); - success = false; - } - - promise->set_value(); - return success; - }; - - // one thread is reserved for processing the frame queue and runs - // the entire lifetime of the stream - if (thread_pool_->n_threads() == 1 || - !thread_pool_->push_job(job)) { - std::string err; - if (!job(err)) { - LOG_ERROR(err); - } - } - - layer_offset += chunk_size; - } - - *file_offset = layer_offset; - } + // std::atomic all_successful = 1; + // + // std::vector> futures; + // + // // wait for the chunks in each shard to finish compressing, then + // defragment + // // and write the shard + // for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) 
{ + // const std::string data_path = data_paths_[shard_idx]; + // auto* file_offset = shard_file_offsets_.data() + shard_idx; + // + // const auto shard_data = collect_chunks_(shard_idx); + // if (shard_data.chunks.empty()) { + // LOG_ERROR("Failed to collect chunks for shard ", shard_idx); + // return false; + // } + // if (shard_data.offset != *file_offset) { + // LOG_ERROR("Inconsistent file offset for shard ", + // shard_idx, + // ": expected ", + // *file_offset, + // ", got ", + // shard_data.offset); + // return false; + // } + // + // size_t layer_offset = shard_data.offset; + // + // for (auto& chunk : shard_data.chunks) { + // auto promise = std::make_shared>(); + // futures.emplace_back(promise->get_future()); + // + // const auto handle = get_handle_(data_path); + // if (handle == nullptr) { + // LOG_ERROR("Failed to get file handle for ", data_path); + // return false; + // } + // + // const auto chunk_size = chunk.size(); // we move it below + // auto job = [data_path, + // handle, + // layer_offset, + // chunk = std::move(chunk), + // promise](std::string& err) { + // bool success; + // try { + // success = seek_and_write(handle.get(), layer_offset, + // chunk); + // } catch (const std::exception& exc) { + // err = "Failed to write chunk at offset " + + // std::to_string(layer_offset) + " to path " + + // data_path + ": " + exc.what(); + // success = false; + // } + // + // promise->set_value(); + // return success; + // }; + // + // // one thread is reserved for processing the frame queue and runs + // // the entire lifetime of the stream + // if (thread_pool_->n_threads() == 1 || + // !thread_pool_->push_job(job)) { + // std::string err; + // if (!job(err)) { + // LOG_ERROR(err); + // } + // } + // + // layer_offset += chunk_size; + // } + // + // *file_offset = layer_offset; + // } // wait for all threads to finish // for (auto& future : futures) { @@ -234,7 +227,6 @@ zarr::FSArray::flush_tables_() for (auto shard_idx = 0; shard_idx < n_shards; 
++shard_idx) { const auto* shard_table = shard_tables_.data() + shard_idx; - auto* file_offset = shard_file_offsets_.data() + shard_idx; const size_t table_size = shard_table->size() * sizeof(uint64_t); std::vector table(table_size + sizeof(uint32_t), 0); @@ -253,14 +245,13 @@ zarr::FSArray::flush_tables_() return false; } - if (!seek_and_write(handle.get(), *file_offset, table)) { + if (!seek_and_write(handle.get(), 0, table)) { LOG_ERROR("Failed to write table and checksum to shard ", shard_idx, " at path ", data_path); return false; } - *file_offset += table.size(); handles_.erase(data_path); // close the handle } @@ -270,7 +261,7 @@ zarr::FSArray::flush_tables_() for (auto& table : shard_tables_) { std::ranges::fill(table, std::numeric_limits::max()); } - std::ranges::fill(shard_file_offsets_, 0); + std::ranges::fill(shard_file_offsets_, table_size_); } return true; @@ -279,7 +270,149 @@ zarr::FSArray::flush_tables_() bool zarr::FSArray::compress_and_flush_data_() { - return false; + // construct paths to shard sinks if they don't already exist + if (data_paths_.empty()) { + make_data_paths_(); + } + + // create parent directories if needed + const auto parent_paths = get_parent_paths(data_paths_); + CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist + + const auto& dims = config_->dimensions; + + const auto n_shards = dims->number_of_shards(); + CHECK(data_paths_.size() == n_shards); + + std::vector mutexes(n_shards); + + const uint32_t chunks_per_shard = dims->chunks_per_shard(); + const uint32_t chunks_in_mem = dims->number_of_chunks_in_memory(); + const uint32_t n_layers = dims->chunk_layers_per_shard(); + const uint32_t chunks_per_layer = chunks_per_shard / n_layers; + + const size_t bytes_per_px = bytes_of_type(config_->dtype); + + // this layer's entries in the shard table begin here + const uint32_t layer_offset = current_layer_ * chunks_per_layer; + + // this layer's entries in the (global) chunk grid begin here + const uint32_t 
chunk_offset = current_layer_ * chunks_in_mem; + + for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { + const std::string data_path = data_paths_[shard_idx]; + + // chunk storage is at chunk_index - chunk_offset + const auto chunk_indices_this_layer = + dims->chunk_indices_for_shard_layer(shard_idx, current_layer_); + + auto* shard_table = shard_tables_.data() + shard_idx; + auto* file_offset = shard_file_offsets_.data() + shard_idx; + auto* shard_mutex = mutexes.data() + shard_idx; + + auto handle = get_handle_(data_path); + if (handle == nullptr) { + LOG_ERROR("Failed to get file handle for ", data_path); + return false; + } + + const auto& params = config_->compression_params; + + for (auto& chunk_idx : chunk_indices_this_layer) { + CHECK(chunk_idx >= chunk_offset); + uint32_t internal_index = dims->shard_internal_index(chunk_idx); + const auto& chunk_data = chunk_buffers_[chunk_idx - chunk_offset]; + auto job = [&chunk_data, + ¶ms, + handle, + data_path, + bytes_per_px, + internal_index, + shard_table, + file_offset, + shard_mutex](std::string& err) { + bool success = true; + std::vector compressed; + const uint8_t* data_out = nullptr; + size_t chunk_size_out = 0; + + try { + // compress here + if (params) { + compressed.resize(chunk_data.size() + + BLOSC_MAX_OVERHEAD); + const auto n_bytes_compressed = + blosc_compress_ctx(params->clevel, + params->shuffle, + bytes_per_px, + chunk_data.size(), + chunk_data.data(), + compressed.data(), + compressed.size(), + params->codec_id.c_str(), + 0, + 1); + if (n_bytes_compressed <= 0) { + err = "blosc_compress_ctx failed with code " + + std::to_string(n_bytes_compressed) + + " for chunk " + + std::to_string(internal_index) + " of shard "; + success = false; + } + data_out = compressed.data(); + chunk_size_out = n_bytes_compressed; + } else { + data_out = chunk_data.data(); + chunk_size_out = chunk_data.size(); + } + EXPECT(success, err); + EXPECT(data_out != nullptr, err); + EXPECT(chunk_size_out != 0, err); + 
+ size_t file_offset_local; + { + std::lock_guard lock(*shard_mutex); + file_offset_local = *file_offset; + *file_offset += chunk_size_out; + } + + success = + seek_and_write(handle.get(), + file_offset_local, + std::span(data_out, chunk_size_out)); + } catch (const std::exception& exc) { + err = "Failed to compress chunk " + + std::to_string(internal_index) + " of shard " + ": " + + exc.what(); + success = false; + } + + return success; + }; + + // one thread is reserved for processing the frame queue and runs + // the entire lifetime of the stream + if (thread_pool_->n_threads() == 1 || + !thread_pool_->push_job(job)) { + if (std::string err; !job(err)) { + LOG_ERROR(err); + } + } + } + } + + if (is_closing_ || should_rollover_()) { // flush table + if (!flush_tables_()) { + LOG_ERROR("Failed to flush shard tables"); + return false; + } + current_layer_ = 0; + } else { + ++current_layer_; + CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); + } + + return true; } void diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 9c8208ec..c300e671 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -3,6 +3,9 @@ #include "array.hh" #include "fs.storage.hh" +#include +#include + namespace zarr { class FSArray final : public Array @@ -15,6 +18,8 @@ class FSArray final protected: std::mutex mutex_; + size_t table_size_; + std::vector>> shard_futures_; std::unordered_map> handles_; bool write_metadata_() override; @@ -25,6 +30,12 @@ class FSArray final bool flush_data_(); bool flush_tables_(); + /** + * @brief Get a file handle for the given path, creating it and adding it to + * the local handle pool if it does not already exist. + * @param path The file path. + * @return The file handle. 
+ */ std::shared_ptr get_handle_(const std::string& path); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index d9e1da77..26155395 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -14,6 +14,8 @@ class S3Array final std::shared_ptr s3_connection_pool); protected: + std::vector shard_file_offsets_; + bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; From e6df041a9f35b428231c15b8b410f9343b156839 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 23 Oct 2025 15:31:58 -0400 Subject: [PATCH 26/38] (wip) --- src/streaming/array.cpp | 95 ++++++---------- src/streaming/array.dimensions.cpp | 21 ++-- src/streaming/array.dimensions.hh | 24 ++++- src/streaming/array.hh | 26 ++--- src/streaming/fs.array.cpp | 102 +----------------- src/streaming/fs.array.hh | 4 +- src/streaming/s3.array.cpp | 60 ++++++++--- src/streaming/s3.array.hh | 15 +-- src/streaming/thread.pool.cpp | 2 +- tests/unit-tests/array-write-even.cpp | 29 ++--- .../array-write-ragged-append-dim.cpp | 24 ++--- 11 files changed, 161 insertions(+), 241 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 06d06a69..03745140 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -67,8 +67,8 @@ zarr::Array::Array(std::shared_ptr config, , bytes_to_flush_{ 0 } , frames_written_{ 0 } , append_shard_index_{ 0 } - , current_layer_{ 0 } , is_closing_{ false } + , current_layer_{ 0 } { const size_t n_chunks = config_->dimensions->number_of_chunks_in_memory(); EXPECT(n_chunks > 0, "Array has zero chunks in memory"); @@ -134,11 +134,15 @@ zarr::Array::write_frame(std::vector& data) bytes_to_flush_ += bytes_written; ++frames_written_; - if (should_flush_()) { + if (should_flush_layer_()) { CHECK(compress_and_flush_data_()); + const auto& dims = config_->dimensions; + const auto lps = dims->chunk_layers_per_shard(); + 
current_layer_ = (current_layer_ + 1) % lps; + if (should_rollover_()) { - rollover_(); + close_shards_(); // also writes the shard tables CHECK(write_metadata_()); } bytes_to_flush_ = 0; @@ -260,10 +264,18 @@ zarr::Array::close_() bool retval = false; is_closing_ = true; try { + const bool flush_tables = bytes_to_flush_ > 0 || current_layer_ > 0; + if (bytes_to_flush_ > 0) { - CHECK(compress_and_flush_data_()); - } else if (current_layer_ > 0) { - // CHECK(flush_tables_()); + if (!compress_and_flush_data_()) { + LOG_ERROR("Failed to flush remaining data on close"); + return false; + } + } + + if (flush_tables && !flush_tables_()) { + LOG_ERROR("Failed to flush shard tables on close"); + return false; } close_io_streams_(); @@ -434,78 +446,33 @@ zarr::Array::write_frame_to_chunks_(std::vector& data) return bytes_written; } -zarr::Array::ShardLayer -zarr::Array::collect_chunks_(uint32_t shard_index) -{ - const auto& dims = config_->dimensions; - CHECK(shard_index < dims->number_of_shards()); - - const auto chunks_per_shard = dims->chunks_per_shard(); - const auto chunks_in_mem = dims->number_of_chunks_in_memory(); - const auto n_layers = dims->chunk_layers_per_shard(); - - const auto chunks_per_layer = chunks_per_shard / n_layers; - const auto layer_offset = current_layer_ * chunks_per_layer; - const auto chunk_offset = current_layer_ * chunks_in_mem; - - auto& shard_table = shard_tables_[shard_index]; - const auto file_offset = shard_file_offsets_[shard_index]; - shard_table[2 * layer_offset] = file_offset; - - uint64_t last_chunk_offset = shard_table[2 * layer_offset]; - uint64_t last_chunk_size = shard_table[2 * layer_offset + 1]; - - for (auto i = 1; i < chunks_per_layer; ++i) { - const auto offset_idx = 2 * (layer_offset + i); - const auto size_idx = offset_idx + 1; - if (shard_table[size_idx] == std::numeric_limits::max()) { - continue; - } - - shard_table[offset_idx] = last_chunk_offset + last_chunk_size; - last_chunk_offset = shard_table[offset_idx]; - 
last_chunk_size = shard_table[size_idx]; - } - - const auto chunk_indices_this_layer = - dims->chunk_indices_for_shard_layer(shard_index, current_layer_); - - ShardLayer layer{ file_offset, {} }; - layer.chunks.reserve(chunk_indices_this_layer.size()); - - for (const auto& idx : chunk_indices_this_layer) { - layer.chunks.emplace_back(chunk_buffers_[idx - chunk_offset]); - } - - return std::move(layer); -} - bool -zarr::Array::should_flush_() const +zarr::Array::should_flush_layer_() const { const auto& dims = config_->dimensions; - size_t frames_before_flush = dims->final_dim().chunk_size_px; - for (auto i = 1; i < dims->ndims() - 2; ++i) { - frames_before_flush *= dims->at(i).array_size_px; - } - - CHECK(frames_before_flush > 0); - return frames_written_ % frames_before_flush == 0; + const size_t frames_per_layer = dims->frames_per_layer(); + return frames_written_ % frames_per_layer == 0; } bool zarr::Array::should_rollover_() const { const auto& dims = config_->dimensions; - return frames_written_ % dims->frames_before_flush() == 0; + const size_t frames_per_shard = dims->frames_per_shard(); + return frames_written_ % frames_per_shard == 0; } void -zarr::Array::rollover_() +zarr::Array::close_shards_() { LOG_DEBUG("Rolling over"); + EXPECT(flush_tables_(), "Failed to flush shard tables during rollover"); close_io_streams_(); - ++append_shard_index_; - data_root_ = node_path_() + "/c/" + std::to_string(append_shard_index_); + + // advance to the next shard index + if (!is_closing_) { + data_root_ = + node_path_() + "/c/" + std::to_string(++append_shard_index_); + } } diff --git a/src/streaming/array.dimensions.cpp b/src/streaming/array.dimensions.cpp index 6f9c9806..2c42ad04 100644 --- a/src/streaming/array.dimensions.cpp +++ b/src/streaming/array.dimensions.cpp @@ -14,8 +14,7 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, const auto ndims = dims_.size(); EXPECT(ndims > 2, "Array must have at least three dimensions."); - frames_before_flush_ = - 
final_dim().chunk_size_px * final_dim().shard_size_chunks; + frames_per_layer_ = final_dim().chunk_size_px; for (auto i = 0; i < ndims; ++i) { const auto& dim = dims_[i]; @@ -26,7 +25,7 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, number_of_chunks_in_memory_ *= zarr::chunks_along_dimension(dim); number_of_shards_ *= zarr::shards_along_dimension(dim); if (i < ndims - 2) { - frames_before_flush_ *= dim.array_size_px; + frames_per_layer_ *= dim.array_size_px; } } } @@ -36,7 +35,7 @@ ArrayDimensions::ArrayDimensions(std::vector&& dims, EXPECT(chunks_per_shard_ > 0, "Array must have at least one chunk per shard."); EXPECT(number_of_shards_ > 0, "Array must have at least one shard."); - EXPECT(frames_before_flush_ > 0, + EXPECT(frames_per_layer_ > 0, "Array must have at least one frame before flush."); chunk_indices_for_shard_.resize(number_of_shards_); @@ -231,14 +230,16 @@ ArrayDimensions::shard_internal_index(uint32_t chunk_index) const return shard_internal_indices_.at(chunk_index); } -/** - * @brief Get the number of frames before a flush is triggered. - * @return The number of frames before a flush. - */ uint64_t -ArrayDimensions::frames_before_flush() const +ArrayDimensions::frames_per_layer() const { - return frames_before_flush_; + return frames_per_layer_; +} + +uint64_t +ArrayDimensions::frames_per_shard() const +{ + return frames_per_layer_ * dims_[0].shard_size_chunks; } uint32_t diff --git a/src/streaming/array.dimensions.hh b/src/streaming/array.dimensions.hh index a9511027..35ace61a 100644 --- a/src/streaming/array.dimensions.hh +++ b/src/streaming/array.dimensions.hh @@ -133,8 +133,9 @@ class ArrayDimensions /** * @brief Get the chunk indices for a specific layer within a shard. * @param shard_index The index of the shard. - * @param layer - * @return + * @param layer The layer within the shard. + * @return A vector of chunk indices for the specified layer within the + * shard. 
*/ std::vector chunk_indices_for_shard_layer(uint32_t shard_index, uint32_t layer) const; @@ -146,7 +147,22 @@ class ArrayDimensions */ uint32_t shard_internal_index(uint32_t chunk_index) const; - uint64_t frames_before_flush() const; + /** + * @brief Get the number of frames in a shard layer, i.e., the number of + * frames that can be stored in one layer of chunks within a shard. + * @note This is used to determine when to flush chunk buffers to storage. + * @return The number of frames in a shard layer. + */ + uint64_t frames_per_layer() const; + + /** + * @brief Get the number of frames per full shard, i.e., the number of + * frames that are written before closing and rolling over to a new shard. + * @details This is just the product of the number of frames per layer and + * the number of chunk layers per shard. + * @return The number of frames per shard. + */ + uint64_t frames_per_shard() const; private: std::vector dims_; @@ -162,7 +178,7 @@ class ArrayDimensions std::unordered_map shard_internal_indices_; std::vector> chunk_indices_for_shard_; - uint64_t frames_before_flush_; + uint64_t frames_per_layer_; uint32_t shard_index_for_chunk_(uint32_t chunk_index) const; uint32_t shard_internal_index_(uint32_t chunk_index) const; diff --git a/src/streaming/array.hh b/src/streaming/array.hh index b0c4c9a2..7971e188 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -33,6 +33,7 @@ class Array : public ArrayBase bool is_closing_; uint32_t current_layer_; + std::vector shard_file_offsets_; std::vector> shard_tables_; bool make_metadata_(std::string& metadata) override; @@ -53,12 +54,12 @@ class Array : public ArrayBase * @brief Determine if we should flush the current chunk buffers to storage. * @return True if we should flush, false otherwise. 
*/ - bool should_flush_() const; + bool should_flush_layer_() const; /** - * @brief Determine if we should rollover to a new shard along the append + * @brief Determine if we should roll over to a new shard along the append * dimension. - * @return True if we should rollover, false otherwise. + * @return True if we should roll over, false otherwise. */ bool should_rollover_() const; @@ -70,17 +71,10 @@ class Array : public ArrayBase size_t write_frame_to_chunks_(std::vector& data); /** - * @brief Collect all chunks for the given shard index at the current layer. - * @param shard_index The shard index. - * @return The collected shard layer. + * @brief Finalize all current shard files and close their associated I/O + * streams. Update the data root to point to the next shard index. */ - [[nodiscard]] ShardLayer collect_chunks_(uint32_t shard_index); - - /** - * @brief Close all current shard files and prepare for writing to a new - * shard along the append dimension. - */ - void rollover_(); + void close_shards_(); /** * @brief Return the location of the shard index for this array ("start" or @@ -96,6 +90,12 @@ class Array : public ArrayBase */ [[nodiscard]] virtual bool compress_and_flush_data_() = 0; + /** + * @brief Flush all shard tables to the underlying storage. + * @return True on success, false on failure. + */ + [[nodiscard]] virtual bool flush_tables_() = 0; + /** * @brief Close all open IO streams associated with this array. 
*/ diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index c6b4c999..b455a53d 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -127,93 +127,6 @@ zarr::FSArray::write_metadata_() return success; } -bool -zarr::FSArray::flush_data_() -{ - // std::atomic all_successful = 1; - // - // std::vector> futures; - // - // // wait for the chunks in each shard to finish compressing, then - // defragment - // // and write the shard - // for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - // const std::string data_path = data_paths_[shard_idx]; - // auto* file_offset = shard_file_offsets_.data() + shard_idx; - // - // const auto shard_data = collect_chunks_(shard_idx); - // if (shard_data.chunks.empty()) { - // LOG_ERROR("Failed to collect chunks for shard ", shard_idx); - // return false; - // } - // if (shard_data.offset != *file_offset) { - // LOG_ERROR("Inconsistent file offset for shard ", - // shard_idx, - // ": expected ", - // *file_offset, - // ", got ", - // shard_data.offset); - // return false; - // } - // - // size_t layer_offset = shard_data.offset; - // - // for (auto& chunk : shard_data.chunks) { - // auto promise = std::make_shared>(); - // futures.emplace_back(promise->get_future()); - // - // const auto handle = get_handle_(data_path); - // if (handle == nullptr) { - // LOG_ERROR("Failed to get file handle for ", data_path); - // return false; - // } - // - // const auto chunk_size = chunk.size(); // we move it below - // auto job = [data_path, - // handle, - // layer_offset, - // chunk = std::move(chunk), - // promise](std::string& err) { - // bool success; - // try { - // success = seek_and_write(handle.get(), layer_offset, - // chunk); - // } catch (const std::exception& exc) { - // err = "Failed to write chunk at offset " + - // std::to_string(layer_offset) + " to path " + - // data_path + ": " + exc.what(); - // success = false; - // } - // - // promise->set_value(); - // return success; - // }; - 
// - // // one thread is reserved for processing the frame queue and runs - // // the entire lifetime of the stream - // if (thread_pool_->n_threads() == 1 || - // !thread_pool_->push_job(job)) { - // std::string err; - // if (!job(err)) { - // LOG_ERROR(err); - // } - // } - // - // layer_offset += chunk_size; - // } - // - // *file_offset = layer_offset; - // } - - // wait for all threads to finish - // for (auto& future : futures) { - // future.wait(); - // } - // - // return static_cast(all_successful); - return true; -} - bool zarr::FSArray::flush_tables_() { @@ -401,17 +314,6 @@ zarr::FSArray::compress_and_flush_data_() } } - if (is_closing_ || should_rollover_()) { // flush table - if (!flush_tables_()) { - LOG_ERROR("Failed to flush shard tables"); - return false; - } - current_layer_ = 0; - } else { - ++current_layer_; - CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); - } - return true; } @@ -429,8 +331,8 @@ std::shared_ptr zarr::FSArray::get_handle_(const std::string& path) { std::unique_lock lock(mutex_); - if (handles_.contains(path)) { - return handles_[path]; + if (const auto it = handles_.find(path); it != handles_.end()) { + return it->second; } void* flags = make_flags(); diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index c300e671..b684505a 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -25,11 +25,9 @@ class FSArray final bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; + bool flush_tables_() override; void close_io_streams_() override; - bool flush_data_(); - bool flush_tables_(); - /** * @brief Get a file handle for the given path, creating it and adding it to * the local handle pool if it does not already exist. 
diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index 24f9f04e..398d9d2c 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -15,6 +15,8 @@ zarr::S3Array::S3Array(std::shared_ptr config, , S3Storage(*config->bucket_name, s3_connection_pool) { CHECK(config_->dimensions); + + const auto& dims = config_->dimensions; } bool @@ -59,17 +61,6 @@ zarr::S3Array::compress_and_flush_data_() return false; } - if (is_closing_ || should_rollover_()) { // flush table - if (!flush_tables_()) { - LOG_ERROR("Failed to flush shard tables"); - return false; - } - current_layer_ = 0; - } else { - ++current_layer_; - CHECK(current_layer_ < config_->dimensions->chunk_layers_per_shard()); - } - return true; } @@ -197,6 +188,52 @@ zarr::S3Array::update_table_entries_() } } +zarr::Array::ShardLayer +zarr::S3Array::collect_chunks_(uint32_t shard_index) +{ + const auto& dims = config_->dimensions; + CHECK(shard_index < dims->number_of_shards()); + + const auto chunks_per_shard = dims->chunks_per_shard(); + const auto chunks_in_mem = dims->number_of_chunks_in_memory(); + const auto n_layers = dims->chunk_layers_per_shard(); + + const auto chunks_per_layer = chunks_per_shard / n_layers; + const auto layer_offset = current_layer_ * chunks_per_layer; + const auto chunk_offset = current_layer_ * chunks_in_mem; + + auto& shard_table = shard_tables_[shard_index]; + const auto file_offset = shard_file_offsets_[shard_index]; + shard_table[2 * layer_offset] = file_offset; + + uint64_t last_chunk_offset = shard_table[2 * layer_offset]; + uint64_t last_chunk_size = shard_table[2 * layer_offset + 1]; + + for (auto i = 1; i < chunks_per_layer; ++i) { + const auto offset_idx = 2 * (layer_offset + i); + const auto size_idx = offset_idx + 1; + if (shard_table[size_idx] == std::numeric_limits::max()) { + continue; + } + + shard_table[offset_idx] = last_chunk_offset + last_chunk_size; + last_chunk_offset = shard_table[offset_idx]; + last_chunk_size = 
shard_table[size_idx]; + } + + const auto chunk_indices_this_layer = + dims->chunk_indices_for_shard_layer(shard_index, current_layer_); + + ShardLayer layer{ file_offset, {} }; + layer.chunks.reserve(chunk_indices_this_layer.size()); + + for (const auto& idx : chunk_indices_this_layer) { + layer.chunks.emplace_back(chunk_buffers_[idx - chunk_offset]); + } + + return std::move(layer); +} + bool zarr::S3Array::flush_data_() { @@ -328,7 +365,6 @@ zarr::S3Array::flush_tables_() std::ranges::fill(table, std::numeric_limits::max()); } std::ranges::fill(shard_file_offsets_, 0); - current_layer_ = 0; } return true; diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index 26155395..f12cb962 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -14,11 +14,10 @@ class S3Array final std::shared_ptr s3_connection_pool); protected: - std::vector shard_file_offsets_; - bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; + bool flush_tables_() override; void close_io_streams_() override; /** @@ -33,15 +32,17 @@ class S3Array final void update_table_entries_(); /** - * @brief Flush the chunk data to S3 or intermediate buffers. - * @return True on success, false on failure. + * @brief Collect all the chunks for a given shard index in the current + * layer. + * @param shard_index The index of the shard to collect chunks for. + * @return The collected chunk buffers in a shard layer structure. */ - bool flush_data_(); + ShardLayer collect_chunks_(uint32_t shard_index); /** - * @brief Flush the shard tables to S3 or intermediate buffers. + * @brief Flush the chunk data to S3 or intermediate buffers. * @return True on success, false on failure. 
*/ - bool flush_tables_(); + bool flush_data_(); }; } // namespace zarr \ No newline at end of file diff --git a/src/streaming/thread.pool.cpp b/src/streaming/thread.pool.cpp index 37bc12ad..ef50f642 100644 --- a/src/streaming/thread.pool.cpp +++ b/src/streaming/thread.pool.cpp @@ -105,5 +105,5 @@ zarr::ThreadPool::process_tasks_() uint32_t zarr::ThreadPool::n_threads() const { - return threads_.size(); + return threads_.size() - 1; // exclude frame queue thread } \ No newline at end of file diff --git a/tests/unit-tests/array-write-even.cpp b/tests/unit-tests/array-write-even.cpp index 814e30f9..45ccefb1 100644 --- a/tests/unit-tests/array-write-even.cpp +++ b/tests/unit-tests/array-write-even.cpp @@ -14,8 +14,6 @@ const fs::path base_dir = fs::temp_directory_path() / TEST; constexpr unsigned int array_width = 64, array_height = 48, array_planes = 6, array_channels = 8, array_timepoints = 10; -constexpr unsigned int n_frames = - array_planes * array_channels * array_timepoints; constexpr unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, chunk_channels = 4, chunk_timepoints = 5; @@ -25,6 +23,9 @@ constexpr unsigned int shard_width = 2, shard_height = 1, shard_planes = 1, constexpr unsigned int chunks_per_shard = shard_width * shard_height * shard_planes * shard_channels * shard_timepoints; +constexpr unsigned int n_frames = + array_planes * array_channels * chunk_timepoints * shard_timepoints; + constexpr unsigned int chunks_in_x = (array_width + chunk_width - 1) / chunk_width; // 4 chunks constexpr unsigned int chunks_in_y = @@ -101,12 +102,12 @@ main() int retval = 1; - const ZarrDataType dtype = ZarrDataType_uint16; + constexpr ZarrDataType dtype = ZarrDataType_uint16; const unsigned int nbytes_px = zarr::bytes_of_type(dtype); try { auto thread_pool = std::make_shared( - 0, [](const std::string& err) { LOG_ERROR("Error: ", err); }); + 1, [](const std::string& err) { LOG_ERROR("Error: ", err); }); std::vector dims; dims.emplace_back("t", @@ 
-132,18 +133,18 @@ main() dims.emplace_back( "x", ZarrDimensionType_Space, array_width, chunk_width, shard_width); - const auto config = std::make_shared( - base_dir.string(), - "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - level_of_detail); - // write the data { + const auto config = std::make_shared( + base_dir.string(), + "", + std::nullopt, + std::nullopt, + std::make_shared(std::move(dims), dtype), + dtype, + std::nullopt, + level_of_detail); + auto writer = std::make_unique( config, thread_pool, std::make_shared()); diff --git a/tests/unit-tests/array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp index 0ac55ac2..3d9171d3 100644 --- a/tests/unit-tests/array-write-ragged-append-dim.cpp +++ b/tests/unit-tests/array-write-ragged-append-dim.cpp @@ -12,29 +12,27 @@ namespace fs = std::filesystem; namespace { const fs::path base_dir = fs::temp_directory_path() / TEST; -const unsigned int array_width = 64, array_height = 48, array_planes = 5; -const unsigned int n_frames = array_planes; +constexpr unsigned int array_width = 64, array_height = 48, array_planes = 5; +constexpr unsigned int n_frames = array_planes; -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2; +constexpr unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2; -const unsigned int shard_width = 2, shard_height = 1, shard_planes = 1; -const unsigned int chunks_per_shard = shard_width * shard_height * shard_planes; +constexpr unsigned int shard_width = 2, shard_height = 1, shard_planes = 1; +constexpr unsigned int chunks_per_shard = shard_width * shard_height * shard_planes; -const unsigned int chunks_in_x = +constexpr unsigned int chunks_in_x = (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = +constexpr unsigned int chunks_in_y = (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = 
+constexpr unsigned int chunks_in_z = (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int shards_in_x = +constexpr unsigned int shards_in_x = (chunks_in_x + shard_width - 1) / shard_width; // 2 shards -const unsigned int shards_in_y = +constexpr unsigned int shards_in_y = (chunks_in_y + shard_height - 1) / shard_height; // 3 shards -const unsigned int shards_in_z = +constexpr unsigned int shards_in_z = (chunks_in_z + shard_planes - 1) / shard_planes; // 3 shards - -const int level_of_detail = 4; } // namespace void From 51832c31f6e4d097f217a3136b73534efa400017 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 23 Oct 2025 17:00:04 -0400 Subject: [PATCH 27/38] (wip) again --- src/streaming/fs.array.cpp | 34 +++++++++++++------ src/streaming/fs.array.hh | 3 +- .../stream-multiple-arrays-to-filesystem.cpp | 4 +-- .../array-write-ragged-internal-dim.cpp | 2 +- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index b455a53d..355ac35d 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -7,7 +7,6 @@ #include // memcp #include -#include #include #include @@ -165,8 +164,6 @@ zarr::FSArray::flush_tables_() data_path); return false; } - - handles_.erase(data_path); // close the handle } // don't reset state if we're closing @@ -197,8 +194,6 @@ zarr::FSArray::compress_and_flush_data_() const auto n_shards = dims->number_of_shards(); CHECK(data_paths_.size() == n_shards); - std::vector mutexes(n_shards); - const uint32_t chunks_per_shard = dims->chunks_per_shard(); const uint32_t chunks_in_mem = dims->number_of_chunks_in_memory(); const uint32_t n_layers = dims->chunk_layers_per_shard(); @@ -221,7 +216,11 @@ zarr::FSArray::compress_and_flush_data_() auto* shard_table = shard_tables_.data() + shard_idx; auto* file_offset = shard_file_offsets_.data() + shard_idx; - auto* shard_mutex = mutexes.data() + shard_idx; + + if 
(!shard_mutexes_.contains(data_path)) { + shard_mutexes_.emplace(); + } + auto* shard_mutex = &shard_mutexes_[data_path]; auto handle = get_handle_(data_path); if (handle == nullptr) { @@ -229,21 +228,28 @@ zarr::FSArray::compress_and_flush_data_() return false; } + if (!futures_.contains(data_path)) { + futures_.emplace(data_path, std::vector>{}); + } const auto& params = config_->compression_params; - for (auto& chunk_idx : chunk_indices_this_layer) { + for (auto i = 0; i < chunk_indices_this_layer.size(); ++i) { + const uint32_t chunk_idx = chunk_indices_this_layer[i]; CHECK(chunk_idx >= chunk_offset); uint32_t internal_index = dims->shard_internal_index(chunk_idx); const auto& chunk_data = chunk_buffers_[chunk_idx - chunk_offset]; + auto promise = std::make_shared>(); + futures_[data_path].push_back(promise->get_future()); + auto job = [&chunk_data, ¶ms, handle, data_path, bytes_per_px, internal_index, - shard_table, file_offset, - shard_mutex](std::string& err) { + shard_mutex, + promise](std::string& err) { bool success = true; std::vector compressed; const uint8_t* data_out = nullptr; @@ -295,11 +301,12 @@ zarr::FSArray::compress_and_flush_data_() std::span(data_out, chunk_size_out)); } catch (const std::exception& exc) { err = "Failed to compress chunk " + - std::to_string(internal_index) + " of shard " + ": " + - exc.what(); + std::to_string(internal_index) + " of shard at " + + data_path + ": " + exc.what(); success = false; } + promise->set_value(); return success; }; @@ -321,6 +328,11 @@ void zarr::FSArray::close_io_streams_() { for (const auto& path : data_paths_) { + for (auto& future : futures_[path]) { + future.wait(); + } + futures_.erase(path); + shard_mutexes_.erase(path); file_handle_pool_->close_handle(path); } diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index b684505a..551b234a 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -19,7 +19,8 @@ class FSArray final protected: std::mutex mutex_; size_t 
table_size_; - std::vector>> shard_futures_; + std::unordered_map shard_mutexes_; + std::unordered_map>> futures_; std::unordered_map> handles_; bool write_metadata_() override; diff --git a/tests/integration/stream-multiple-arrays-to-filesystem.cpp b/tests/integration/stream-multiple-arrays-to-filesystem.cpp index 56cdd967..dcd16d2e 100644 --- a/tests/integration/stream-multiple-arrays-to-filesystem.cpp +++ b/tests/integration/stream-multiple-arrays-to-filesystem.cpp @@ -287,8 +287,8 @@ verify_codecs(const nlohmann::json& metadata, EXPECT(codec_config.contains("index_location"), "Expected key 'index_location' in codec configuration"); auto index_location = codec_config["index_location"].get(); - EXPECT(index_location == "end", - "Expected index_location to be 'end', got '", + EXPECT(index_location == "start", + "Expected index_location to be 'start', got '", index_location, "'"); diff --git a/tests/unit-tests/array-write-ragged-internal-dim.cpp b/tests/unit-tests/array-write-ragged-internal-dim.cpp index c0cee78c..8266d310 100644 --- a/tests/unit-tests/array-write-ragged-internal-dim.cpp +++ b/tests/unit-tests/array-write-ragged-internal-dim.cpp @@ -99,7 +99,7 @@ main() try { auto thread_pool = std::make_shared( std::thread::hardware_concurrency(), - [](const std::string& err) { LOG_ERROR("Error: ", err.c_str()); }); + [](const std::string& err) { LOG_ERROR("Error: ", err); }); std::vector dims; dims.emplace_back("t", From 2e8947476ac7668bfdff2760838fe1f5f8b3cf73 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Fri, 24 Oct 2025 17:23:04 -0400 Subject: [PATCH 28/38] (wip) --- python/tests/test_stream.py | 30 ++--- src/streaming/array.cpp | 19 +-- src/streaming/array.hh | 11 +- src/streaming/fs.array.cpp | 116 +++++++++--------- src/streaming/fs.array.hh | 3 +- src/streaming/s3.array.cpp | 3 +- src/streaming/s3.array.hh | 9 +- .../array-write-ragged-append-dim.cpp | 34 ++--- .../unit-tests/zarr-stream-partial-append.cpp | 56 ++++++--- 9 files changed, 144 
insertions(+), 137 deletions(-) diff --git a/python/tests/test_stream.py b/python/tests/test_stream.py index de3bc04c..66ab2e6d 100644 --- a/python/tests/test_stream.py +++ b/python/tests/test_stream.py @@ -378,7 +378,6 @@ def test_stream_data_to_filesystem( data[i, :, :] = i stream.append(data) - stream.close() # close the stream, flush the files chunk_size_bytes = data.dtype.itemsize @@ -394,12 +393,23 @@ def test_stream_data_to_filesystem( shard_size_bytes + table_size_bytes + 4 ) # 4 bytes for crc32c checksum + for x in range(settings.arrays[0].dimensions[-1].array_size_px): + for y in range(settings.arrays[0].dimensions[-2].array_size_px): + for z in range(settings.arrays[0].dimensions[-3].array_size_px): + shard_file = store_path / "test.zarr" / "0" / "c" / str(z) / str(y) / str(x) + assert shard_file.is_file() + if compression_codec is None: + assert shard_file.stat().st_size == shard_size_bytes + else: + size = shard_file.stat().st_size + assert table_size_bytes < size <= shard_size_bytes + group = zarr.open(settings.store_path, mode="r") array = group["0"] assert array.shape == data.shape for i in range(array.shape[0]): - assert np.array_equal(array[i, :, :], data[i, :, :]) + assert np.array_equal(array[i, :, :], data[i, :, :]), f"Data mismatch at index {i}" metadata = array.metadata sharding_codec = metadata.codecs[0] @@ -415,22 +425,8 @@ def test_stream_data_to_filesystem( assert blosc_codec.cname == cname assert blosc_codec.clevel == 1 assert blosc_codec.shuffle == zblosc.BloscShuffle.shuffle - - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).is_file() - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).stat().st_size <= shard_size_bytes else: - assert len(sharding_codec.codecs) == 1 - - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).is_file() - assert ( - store_path / "test.zarr" / "0" / "c" / "0" / "0" / "0" - ).stat().st_size == shard_size_bytes + assert len(sharding_codec.codecs) == 1 
# bytes codec @pytest.mark.parametrize( diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 03745140..e2bf8467 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -135,17 +135,17 @@ zarr::Array::write_frame(std::vector& data) ++frames_written_; if (should_flush_layer_()) { - CHECK(compress_and_flush_data_()); + EXPECT(compress_and_flush_data_(), "Failed to flush chunk layer data"); + bytes_to_flush_ = 0; const auto& dims = config_->dimensions; const auto lps = dims->chunk_layers_per_shard(); current_layer_ = (current_layer_ + 1) % lps; if (should_rollover_()) { - close_shards_(); // also writes the shard tables + close_shards_(); CHECK(write_metadata_()); } - bytes_to_flush_ = 0; } return bytes_written; @@ -264,20 +264,14 @@ zarr::Array::close_() bool retval = false; is_closing_ = true; try { - const bool flush_tables = bytes_to_flush_ > 0 || current_layer_ > 0; - if (bytes_to_flush_ > 0) { if (!compress_and_flush_data_()) { LOG_ERROR("Failed to flush remaining data on close"); return false; } + bytes_to_flush_ = 0; } - - if (flush_tables && !flush_tables_()) { - LOG_ERROR("Failed to flush shard tables on close"); - return false; - } - close_io_streams_(); + finalize_io_streams_(); if (frames_written_ > 0) { CHECK(write_metadata_()); @@ -467,8 +461,7 @@ zarr::Array::close_shards_() { LOG_DEBUG("Rolling over"); - EXPECT(flush_tables_(), "Failed to flush shard tables during rollover"); - close_io_streams_(); + finalize_io_streams_(); // advance to the next shard index if (!is_closing_) { diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 7971e188..61ca5e6b 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -91,15 +91,10 @@ class Array : public ArrayBase [[nodiscard]] virtual bool compress_and_flush_data_() = 0; /** - * @brief Flush all shard tables to the underlying storage. - * @return True on success, false on failure. 
- */ - [[nodiscard]] virtual bool flush_tables_() = 0; - - /** - * @brief Close all open IO streams associated with this array. + * @brief Ensure all tables are flushed and close all open IO streams + * associated with this array. */ - virtual void close_io_streams_() = 0; + virtual void finalize_io_streams_() = 0; friend class MultiscaleArray; }; diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index 355ac35d..f652a0af 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -126,55 +126,10 @@ zarr::FSArray::write_metadata_() return success; } -bool -zarr::FSArray::flush_tables_() +std::string +zarr::FSArray::index_location_() const { - // construct paths to shard sinks if they don't already exist - if (data_paths_.empty()) { - make_data_paths_(); - } - - const auto& dims = config_->dimensions; - const auto n_shards = dims->number_of_shards(); - - for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { - const auto* shard_table = shard_tables_.data() + shard_idx; - - const size_t table_size = shard_table->size() * sizeof(uint64_t); - std::vector table(table_size + sizeof(uint32_t), 0); - - memcpy(table.data(), shard_table->data(), table_size); - - // compute crc32 checksum of the table - const uint32_t checksum = crc32c::Crc32c(table.data(), table_size); - memcpy(table.data() + table_size, &checksum, sizeof(uint32_t)); - - std::string data_path = data_paths_[shard_idx]; - - const auto handle = get_handle_(data_path); - if (handle == nullptr) { - LOG_ERROR("Failed to get file handle for ", data_path); - return false; - } - - if (!seek_and_write(handle.get(), 0, table)) { - LOG_ERROR("Failed to write table and checksum to shard ", - shard_idx, - " at path ", - data_path); - return false; - } - } - - // don't reset state if we're closing - if (!is_closing_) { - for (auto& table : shard_tables_) { - std::ranges::fill(table, std::numeric_limits::max()); - } - std::ranges::fill(shard_file_offsets_, table_size_); - } - - 
return true; + return "start"; } bool @@ -246,6 +201,7 @@ zarr::FSArray::compress_and_flush_data_() handle, data_path, bytes_per_px, + shard_table, internal_index, file_offset, shard_mutex, @@ -253,7 +209,7 @@ zarr::FSArray::compress_and_flush_data_() bool success = true; std::vector compressed; const uint8_t* data_out = nullptr; - size_t chunk_size_out = 0; + uint64_t chunk_size_out = 0; try { // compress here @@ -288,17 +244,38 @@ zarr::FSArray::compress_and_flush_data_() EXPECT(data_out != nullptr, err); EXPECT(chunk_size_out != 0, err); - size_t file_offset_local; + uint64_t file_offset_local; { std::lock_guard lock(*shard_mutex); file_offset_local = *file_offset; *file_offset += chunk_size_out; } + // write data success = seek_and_write(handle.get(), file_offset_local, std::span(data_out, chunk_size_out)); + EXPECT(success, + "Failed to write chunk data to ", + data_path, + " internal index ", + internal_index); + + // write table entry + const std::vector table_entry = { file_offset_local, + chunk_size_out }; + shard_table->at(2 * internal_index) = file_offset_local; + shard_table->at(2 * internal_index + 1) = chunk_size_out; + + const size_t table_entry_offset = + 2 * sizeof(uint64_t) * internal_index; + success = + seek_and_write(handle.get(), + table_entry_offset, + std::span(reinterpret_cast( + table_entry.data()), + sizeof(uint64_t) * 2)); } catch (const std::exception& exc) { err = "Failed to compress chunk " + std::to_string(internal_index) + " of shard at " + @@ -325,14 +302,43 @@ zarr::FSArray::compress_and_flush_data_() } void -zarr::FSArray::close_io_streams_() +zarr::FSArray::finalize_io_streams_() { - for (const auto& path : data_paths_) { + for (auto shard_idx = 0; shard_idx < data_paths_.size(); ++shard_idx) { + const auto& path = data_paths_[shard_idx]; for (auto& future : futures_[path]) { future.wait(); } futures_.erase(path); shard_mutexes_.erase(path); + + // compute table checksum and write it out + { + const auto handle = 
get_handle_(path); + EXPECT(handle != nullptr, + "Failed to get file handle for finalizing ", + path); + + auto& shard_table = shard_tables_[shard_idx]; + const size_t table_size = shard_table.size() * sizeof(uint64_t); + const auto* table_data = + reinterpret_cast(shard_table.data()); + const uint32_t checksum = crc32c::Crc32c(table_data, table_size); + + EXPECT(seek_and_write( + handle.get(), + table_size, + std::span{ reinterpret_cast(&checksum), + sizeof(uint32_t) }), + "Failed to write final checksum for shard at ", + path); + + std::ranges::fill(shard_table, + std::numeric_limits::max()); + shard_file_offsets_[shard_idx] = table_size_; + } + + handles_.erase(path); file_handle_pool_->close_handle(path); } @@ -354,9 +360,3 @@ zarr::FSArray::get_handle_(const std::string& path) handles_.emplace(path, handle); return handle; } - -std::string -zarr::FSArray::index_location_() const -{ - return "start"; -} diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 551b234a..9182e6fb 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -26,8 +26,7 @@ class FSArray final bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; - bool flush_tables_() override; - void close_io_streams_() override; + void finalize_io_streams_() override; /** * @brief Get a file handle for the given path, creating it and adding it to diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index 398d9d2c..d5a95b97 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -65,8 +65,9 @@ zarr::S3Array::compress_and_flush_data_() } void -zarr::S3Array::close_io_streams_() +zarr::S3Array::finalize_io_streams_() { + const bool flush_tables = bytes_to_flush_ > 0 || current_layer_ > 0; for (const auto& key : data_paths_) { EXPECT(finalize_object(key), "Failed to finalize S3 object at ", key); } diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index 
f12cb962..60988b3a 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -17,8 +17,7 @@ class S3Array final bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; - bool flush_tables_() override; - void close_io_streams_() override; + void finalize_io_streams_() override; /** * @brief Compress all the chunk buffers in place. @@ -44,5 +43,11 @@ class S3Array final * @return True on success, false on failure. */ bool flush_data_(); + + /** + * @brief Flush all shard tables to S3. + * @return True on success, false on failure. + */ + bool flush_tables_(); }; } // namespace zarr \ No newline at end of file diff --git a/tests/unit-tests/array-write-ragged-append-dim.cpp b/tests/unit-tests/array-write-ragged-append-dim.cpp index 3d9171d3..f32ce514 100644 --- a/tests/unit-tests/array-write-ragged-append-dim.cpp +++ b/tests/unit-tests/array-write-ragged-append-dim.cpp @@ -18,7 +18,8 @@ constexpr unsigned int n_frames = array_planes; constexpr unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2; constexpr unsigned int shard_width = 2, shard_height = 1, shard_planes = 1; -constexpr unsigned int chunks_per_shard = shard_width * shard_height * shard_planes; +constexpr unsigned int chunks_per_shard = + shard_width * shard_height * shard_planes; constexpr unsigned int chunks_in_x = (array_width + chunk_width - 1) / chunk_width; // 4 chunks @@ -79,12 +80,12 @@ main() int retval = 1; - const ZarrDataType dtype = ZarrDataType_int32; + constexpr ZarrDataType dtype = ZarrDataType_int32; const unsigned int nbytes_px = zarr::bytes_of_type(dtype); try { auto thread_pool = std::make_shared( - std::thread::hardware_concurrency(), + 1, [](const std::string& err) { LOG_ERROR("Error: ", err.c_str()); }); std::vector dims; @@ -101,17 +102,16 @@ main() dims.emplace_back( "x", ZarrDimensionType_Space, array_width, chunk_width, shard_width); - auto config = std::make_shared( - base_dir.string(), 
- "", - std::nullopt, - std::nullopt, - std::make_shared(std::move(dims), dtype), - dtype, - std::nullopt, - 4); - { + auto config = std::make_shared( + base_dir.string(), + "", + std::nullopt, + std::nullopt, + std::make_shared(std::move(dims), dtype), + dtype, + std::nullopt, + 4); auto writer = std::make_unique( config, thread_pool, std::make_shared()); @@ -129,10 +129,10 @@ main() const auto chunk_size = chunk_width * chunk_height * chunk_planes * nbytes_px; - const auto index_size = chunks_per_shard * - sizeof(uint64_t) * // indices are 64 bits - 2; // 2 indices per chunk - const auto checksum_size = 4; // CRC32 checksum + constexpr size_t index_size = chunks_per_shard * + sizeof(uint64_t) * // indices are 64 bits + 2; // 2 indices per chunk + constexpr size_t checksum_size = 4; // CRC32 checksum const auto expected_file_size = shard_width * shard_height * shard_planes * chunk_size + index_size + checksum_size; diff --git a/tests/unit-tests/zarr-stream-partial-append.cpp b/tests/unit-tests/zarr-stream-partial-append.cpp index b89a84c2..b11be3e3 100644 --- a/tests/unit-tests/zarr-stream-partial-append.cpp +++ b/tests/unit-tests/zarr-stream-partial-append.cpp @@ -48,6 +48,9 @@ verify_file_data(const ZarrStreamSettings& settings) const size_t row_size = settings.arrays->dimensions[2].array_size_px, num_rows = settings.arrays->dimensions[1].array_size_px; + constexpr size_t table_size = 2 * sizeof(uint64_t) + 4; + const size_t chunk_size = row_size * num_rows; + fs::path shard_path = fs::path(settings.store_path) / "0" / "c" / "0" / "0" / "0"; CHECK(fs::is_regular_file(shard_path)); @@ -60,17 +63,20 @@ verify_file_data(const ZarrStreamSettings& settings) // Get file size file.seekg(0, std::ios::end); const auto file_size = file.tellg(); - file.seekg(0, std::ios::beg); + EXPECT(file_size == + static_cast(chunk_size + table_size), + "Unexpected file size: ", + file_size); + file.seekg(table_size, std::ios::beg); // skip table header // Read entire file into buffer - 
buffer.resize(file_size); - file.read(reinterpret_cast(buffer.data()), file_size); + buffer.resize(chunk_size); + file.read(reinterpret_cast(buffer.data()), chunk_size); CHECK(file.good()); } // Verify each row contains the correct values - constexpr size_t table_size = 2 * sizeof(uint64_t) + 4; - EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); + EXPECT_EQ(int, buffer.size(), chunk_size); for (size_t row = 0; row < num_rows; ++row) { // Check each byte in this row for (size_t col = 0; col < row_size; ++col) { @@ -90,16 +96,20 @@ verify_file_data(const ZarrStreamSettings& settings) // Get file size file.seekg(0, std::ios::end); const auto file_size = file.tellg(); - file.seekg(0, std::ios::beg); + EXPECT(file_size == + static_cast(chunk_size + table_size), + "Unexpected file size: ", + file_size); + file.seekg(table_size, std::ios::beg); // skip table header // Read entire file into buffer - buffer.resize(file_size); - file.read(reinterpret_cast(buffer.data()), file_size); + buffer.resize(chunk_size); + file.read(reinterpret_cast(buffer.data()), chunk_size); CHECK(file.good()); } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); + EXPECT_EQ(int, buffer.size(), chunk_size); for (size_t row = 0; row < num_rows; ++row) { // Check each byte in this row for (size_t col = 0; col < row_size; ++col) { @@ -124,18 +134,22 @@ verify_file_data(const ZarrStreamSettings& settings) // Get file size file.seekg(0, std::ios::end); const auto file_size = file.tellg(); - file.seekg(0, std::ios::beg); + EXPECT(file_size == + static_cast(chunk_size + table_size), + "Unexpected file size: ", + file_size); + file.seekg(table_size, std::ios::beg); // skip table header // Read entire file into buffer - buffer.resize(file_size); - file.read(reinterpret_cast(buffer.data()), file_size); + buffer.resize(chunk_size); + file.read(reinterpret_cast(buffer.data()), chunk_size); CHECK(file.good()); } // Verify each row 
contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); + EXPECT_EQ(int, buffer.size(), chunk_size); - for (auto i = 0; i < row_size * num_rows; ++i) { + for (auto i = 0; i < chunk_size; ++i) { EXPECT_EQ(int, buffer[i], px_value++); } @@ -150,18 +164,22 @@ verify_file_data(const ZarrStreamSettings& settings) // Get file size file.seekg(0, std::ios::end); const auto file_size = file.tellg(); - file.seekg(0, std::ios::beg); + EXPECT(file_size == + static_cast(chunk_size + table_size), + "Unexpected file size: ", + file_size); + file.seekg(table_size, std::ios::beg); // skip table header // Read entire file into buffer - buffer.resize(file_size); - file.read(reinterpret_cast(buffer.data()), file_size); + buffer.resize(chunk_size); + file.read(reinterpret_cast(buffer.data()), chunk_size); CHECK(file.good()); } // Verify each row contains the correct values - EXPECT_EQ(int, buffer.size(), row_size* num_rows + table_size); + EXPECT_EQ(int, buffer.size(), chunk_size); - for (auto i = 0; i < row_size * num_rows; ++i) { + for (auto i = 0; i < chunk_size; ++i) { EXPECT_EQ(int, buffer[i], px_value++); } } From 9990b6a26e6ef91989c905c51dce87c48f4d6b14 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Mon, 27 Oct 2025 15:55:27 -0400 Subject: [PATCH 29/38] (wip) remove unused FSStorage::write_binary --- src/streaming/fs.storage.cpp | 22 ---------------------- src/streaming/fs.storage.hh | 11 ----------- 2 files changed, 33 deletions(-) diff --git a/src/streaming/fs.storage.cpp b/src/streaming/fs.storage.cpp index c229d30e..7f404cd2 100644 --- a/src/streaming/fs.storage.cpp +++ b/src/streaming/fs.storage.cpp @@ -17,28 +17,6 @@ zarr::FSStorage::FSStorage(std::shared_ptr file_handle_pool) { } -bool -zarr::FSStorage::write_binary(const std::string& path, - const std::vector& data, - size_t offset) const -{ - void* flags = make_flags(); - const auto handle = file_handle_pool_->get_handle(path, flags); - destroy_flags(flags); - - if (handle == 
nullptr) { - LOG_ERROR("Failed to get file handle for ", path); - return false; - } - - if (!seek_and_write(handle.get(), offset, data)) { - LOG_ERROR("Failed to write binary data to ", path); - return false; - } - - return true; -} - bool zarr::FSStorage::write_string(const std::string& path, const std::string& data, diff --git a/src/streaming/fs.storage.hh b/src/streaming/fs.storage.hh index 1168694b..d2b8637b 100644 --- a/src/streaming/fs.storage.hh +++ b/src/streaming/fs.storage.hh @@ -12,17 +12,6 @@ class FSStorage explicit FSStorage(std::shared_ptr file_handle_pool); virtual ~FSStorage() = default; - /** - * @brief Write binary data to a path at the given offset. - * @param path The path to write to. - * @param data The data to write. - * @param offset The offset to write at. - * @return True if the write was successful, false otherwise. - */ - [[nodiscard]] bool write_binary(const std::string& path, - const std::vector& data, - size_t offset) const; - /** * @brief Write a string to a path at the given offset. * @param path The path to write to. 
From 21803aeb71c310cac719ac029dbce598eb4077bc Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Mon, 27 Oct 2025 16:58:20 -0400 Subject: [PATCH 30/38] Log a more useful error message in zarr-stream-partial-append.cpp --- tests/unit-tests/zarr-stream-partial-append.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/unit-tests/zarr-stream-partial-append.cpp b/tests/unit-tests/zarr-stream-partial-append.cpp index b11be3e3..8b233d1b 100644 --- a/tests/unit-tests/zarr-stream-partial-append.cpp +++ b/tests/unit-tests/zarr-stream-partial-append.cpp @@ -81,7 +81,13 @@ verify_file_data(const ZarrStreamSettings& settings) // Check each byte in this row for (size_t col = 0; col < row_size; ++col) { const size_t index = row * row_size + col; - EXPECT_EQ(int, buffer[index], row); + EXPECT(buffer[index] == row, + "Unexpected value at row ", + row, + " col ", + col, + ": ", + static_cast(buffer[index])); } } From 2239435db49a8db6eb978312312c8c55c8c77315 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Mon, 27 Oct 2025 17:02:34 -0400 Subject: [PATCH 31/38] Write FSArray shard tables in one go (don't miss unwritten chunks) --- src/streaming/fs.array.cpp | 69 ++++++++++++++++++-------------------- src/streaming/fs.array.hh | 6 ++++ 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index f652a0af..e54ff4fa 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -263,19 +263,8 @@ zarr::FSArray::compress_and_flush_data_() internal_index); // write table entry - const std::vector table_entry = { file_offset_local, - chunk_size_out }; shard_table->at(2 * internal_index) = file_offset_local; shard_table->at(2 * internal_index + 1) = chunk_size_out; - - const size_t table_entry_offset = - 2 * sizeof(uint64_t) * internal_index; - success = - seek_and_write(handle.get(), - table_entry_offset, - std::span(reinterpret_cast( - table_entry.data()), - sizeof(uint64_t) * 2)); 
} catch (const std::exception& exc) { err = "Failed to compress chunk " + std::to_string(internal_index) + " of shard at " + @@ -312,31 +301,7 @@ zarr::FSArray::finalize_io_streams_() futures_.erase(path); shard_mutexes_.erase(path); - // compute table checksum and write it out - { - const auto handle = get_handle_(path); - EXPECT(handle != nullptr, - "Failed to get file handle for finalizing ", - path); - - auto& shard_table = shard_tables_[shard_idx]; - const size_t table_size = shard_table.size() * sizeof(uint64_t); - const auto* table_data = - reinterpret_cast(shard_table.data()); - const uint32_t checksum = crc32c::Crc32c(table_data, table_size); - - EXPECT(seek_and_write( - handle.get(), - table_size, - std::span{ reinterpret_cast(&checksum), - sizeof(uint32_t) }), - "Failed to write final checksum for shard at ", - path); - - std::ranges::fill(shard_table, - std::numeric_limits::max()); - shard_file_offsets_[shard_idx] = table_size_; - } + write_table_entries_(shard_idx); handles_.erase(path); file_handle_pool_->close_handle(path); @@ -360,3 +325,35 @@ zarr::FSArray::get_handle_(const std::string& path) handles_.emplace(path, handle); return handle; } + +void +zarr::FSArray::write_table_entries_(uint32_t shard_idx) +{ + CHECK(shard_idx < shard_tables_.size()); + const auto& path = data_paths_[shard_idx]; + const auto handle = get_handle_(path); + + EXPECT( + handle != nullptr, "Failed to get file handle for finalizing ", path); + + // compute table checksum and write it out + auto& shard_table = shard_tables_[shard_idx]; + const size_t table_size = shard_table.size() * sizeof(uint64_t); + const auto* table_data = + reinterpret_cast(shard_table.data()); + const uint32_t checksum = crc32c::Crc32c(table_data, table_size); + + const size_t table_buffer_size = shard_table.size() * sizeof(uint64_t); + constexpr size_t checksum_size = sizeof(uint32_t); + + std::vector table_buffer(table_buffer_size + checksum_size); + memcpy(table_buffer.data(), table_data, 
table_size); + memcpy(table_buffer.data() + table_buffer_size, &checksum, checksum_size); + + EXPECT(seek_and_write(handle.get(), 0, table_buffer), + "Failed to write final checksum for shard at ", + path); + + std::ranges::fill(shard_table, std::numeric_limits::max()); + shard_file_offsets_[shard_idx] = table_size_; +} diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 9182e6fb..7604d0a5 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -35,5 +35,11 @@ class FSArray final * @return The file handle. */ std::shared_ptr get_handle_(const std::string& path); + + /** + * @brief Write the shard table entries for the given shard index. + * @param shard_idx The shard index. + */ + void write_table_entries_(uint32_t shard_idx); }; } // namespace zarr \ No newline at end of file From fdd348e5e6a1e3fe9774931c32d5ab074cf71a16 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Wed, 29 Oct 2025 13:36:56 -0400 Subject: [PATCH 32/38] Fix S3Array table flushing --- src/streaming/s3.array.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index d5a95b97..d8ff0666 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -67,7 +67,15 @@ zarr::S3Array::compress_and_flush_data_() void zarr::S3Array::finalize_io_streams_() { - const bool flush_tables = bytes_to_flush_ > 0 || current_layer_ > 0; + const bool should_rollover = should_rollover_(); + const bool should_flush_anyway = + is_closing_ && + frames_written_ % config_->dimensions->frames_per_layer() != 0; + + if (should_rollover || should_flush_anyway) { + flush_tables_(); + } + for (const auto& key : data_paths_) { EXPECT(finalize_object(key), "Failed to finalize S3 object at ", key); } From f904073cd015c2cbfec23e153b2d76d659b168f0 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 10:14:31 -0400 Subject: [PATCH 33/38] Move `ShardLayer` to `S3Array` --- 
src/streaming/array.hh | 6 ------ src/streaming/s3.array.cpp | 2 +- src/streaming/s3.array.hh | 6 ++++++ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 61ca5e6b..4e321e09 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -17,12 +17,6 @@ class Array : public ArrayBase [[nodiscard]] size_t write_frame(std::vector&) override; protected: - struct ShardLayer - { - size_t offset; // offset in bytes from start of shard - std::vector> chunks; - }; - std::vector> chunk_buffers_; std::vector data_paths_; diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index d8ff0666..83635c1f 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -197,7 +197,7 @@ zarr::S3Array::update_table_entries_() } } -zarr::Array::ShardLayer +zarr::S3Array::ShardLayer zarr::S3Array::collect_chunks_(uint32_t shard_index) { const auto& dims = config_->dimensions; diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index 60988b3a..45ea6907 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -14,6 +14,12 @@ class S3Array final std::shared_ptr s3_connection_pool); protected: + struct ShardLayer + { + size_t offset; // offset in bytes from start of shard + std::vector> chunks; + }; + bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; From 0f89a9ce77d6e121678195ad435a3ee851d893de Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 10:22:18 -0400 Subject: [PATCH 34/38] Rename `Array::finalize_io_streams_` to `Array::finalize_append_shard_` --- src/streaming/array.cpp | 4 ++-- src/streaming/array.hh | 2 +- src/streaming/fs.array.cpp | 2 +- src/streaming/fs.array.hh | 2 +- src/streaming/s3.array.cpp | 2 +- src/streaming/s3.array.hh | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 
e2bf8467..8efd3d0f 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -271,7 +271,7 @@ zarr::Array::close_() } bytes_to_flush_ = 0; } - finalize_io_streams_(); + finalize_append_shard_(); if (frames_written_ > 0) { CHECK(write_metadata_()); @@ -461,7 +461,7 @@ zarr::Array::close_shards_() { LOG_DEBUG("Rolling over"); - finalize_io_streams_(); + finalize_append_shard_(); // advance to the next shard index if (!is_closing_) { diff --git a/src/streaming/array.hh b/src/streaming/array.hh index 4e321e09..cf3bc576 100644 --- a/src/streaming/array.hh +++ b/src/streaming/array.hh @@ -88,7 +88,7 @@ class Array : public ArrayBase * @brief Ensure all tables are flushed and close all open IO streams * associated with this array. */ - virtual void finalize_io_streams_() = 0; + virtual void finalize_append_shard_() = 0; friend class MultiscaleArray; }; diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index e54ff4fa..251a0828 100644 --- a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -291,7 +291,7 @@ zarr::FSArray::compress_and_flush_data_() } void -zarr::FSArray::finalize_io_streams_() +zarr::FSArray::finalize_append_shard_() { for (auto shard_idx = 0; shard_idx < data_paths_.size(); ++shard_idx) { const auto& path = data_paths_[shard_idx]; diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 7604d0a5..72816e17 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -26,7 +26,7 @@ class FSArray final bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; - void finalize_io_streams_() override; + void finalize_append_shard_() override; /** * @brief Get a file handle for the given path, creating it and adding it to diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index 83635c1f..3b55d916 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -65,7 +65,7 @@ 
zarr::S3Array::compress_and_flush_data_() } void -zarr::S3Array::finalize_io_streams_() +zarr::S3Array::finalize_append_shard_() { const bool should_rollover = should_rollover_(); const bool should_flush_anyway = diff --git a/src/streaming/s3.array.hh b/src/streaming/s3.array.hh index 45ea6907..92970284 100644 --- a/src/streaming/s3.array.hh +++ b/src/streaming/s3.array.hh @@ -23,7 +23,7 @@ class S3Array final bool write_metadata_() override; std::string index_location_() const override; bool compress_and_flush_data_() override; - void finalize_io_streams_() override; + void finalize_append_shard_() override; /** * @brief Compress all the chunk buffers in place. From 15efd33a2b5ef78f8511c4b27a02d226736a3211 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 11:26:29 -0400 Subject: [PATCH 35/38] (wip): allow thread creation from other threads --- src/streaming/thread.pool.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/streaming/thread.pool.cpp b/src/streaming/thread.pool.cpp index ef50f642..d934d961 100644 --- a/src/streaming/thread.pool.cpp +++ b/src/streaming/thread.pool.cpp @@ -34,8 +34,7 @@ bool zarr::ThreadPool::push_job(Task&& job) { std::unique_lock lock(jobs_mutex_); - // only allow pushing jobs from the main thread - if (!accepting_jobs || std::this_thread::get_id() != main_thread_id_) { + if (!accepting_jobs /*|| std::this_thread::get_id() != main_thread_id_*/) { return false; } From aac1154cd7f0fb6d94695ca3f12380c975a06a1a Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 11:33:39 -0400 Subject: [PATCH 36/38] (wip): collect shard file data in structs, wait to close shard files until the close --- src/streaming/fs.array.cpp | 187 ++++++++++++++++++++++++------------- src/streaming/fs.array.hh | 24 ++++- 2 files changed, 144 insertions(+), 67 deletions(-) diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index 251a0828..be6d0299 100644 --- a/src/streaming/fs.array.cpp +++ 
b/src/streaming/fs.array.cpp @@ -7,6 +7,7 @@ #include // memcp #include +#include #include #include @@ -93,15 +94,45 @@ make_dirs(const std::vector& dir_paths, } } // namespace +bool +zarr::FSArray::ShardFile::close() +{ + // finish writing chunks + for (auto& future : chunk_futures) { + future.wait(); + } + chunk_futures.clear(); + + // compute table checksum and write it out + const size_t table_size = table.size() * sizeof(uint64_t); + const auto* table_data = reinterpret_cast(table.data()); + const uint32_t checksum = crc32c::Crc32c(table_data, table_size); + + const size_t table_buffer_size = table.size() * sizeof(uint64_t); + constexpr size_t checksum_size = sizeof(uint32_t); + + std::vector table_buffer(table_buffer_size + checksum_size); + memcpy(table_buffer.data(), table_data, table_size); + memcpy(table_buffer.data() + table_buffer_size, &checksum, checksum_size); + + if (!seek_and_write(handle.get(), 0, table_buffer)) { + LOG_ERROR("Failed to write table and checksum for shard at ", path); + return false; + } + + return true; +} + zarr::FSArray::FSArray(std::shared_ptr config, std::shared_ptr thread_pool, std::shared_ptr file_handle_pool) : Array(config, thread_pool) , FSStorage(file_handle_pool) + , table_size_bytes_(config->dimensions->chunks_per_shard() * 2 * + sizeof(uint64_t) + + sizeof(uint32_t)) { - table_size_ = - config_->dimensions->chunks_per_shard() * 2 * sizeof(uint64_t) + 4; - std::ranges::fill(shard_file_offsets_, table_size_); + std::ranges::fill(shard_file_offsets_, table_size_bytes_); } bool @@ -135,21 +166,33 @@ zarr::FSArray::index_location_() const bool zarr::FSArray::compress_and_flush_data_() { + const auto& dims = config_->dimensions; + const uint32_t chunks_per_shard = dims->chunks_per_shard(); + const auto n_shards = dims->number_of_shards(); + // construct paths to shard sinks if they don't already exist if (data_paths_.empty()) { make_data_paths_(); + CHECK(data_paths_.size() == n_shards); + + // create parent directories if 
needed + const auto parent_paths = get_parent_paths(data_paths_); + CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist + + // create shard files + std::unique_lock lock(shard_files_mutex_); + for (const auto& path : data_paths_) { + auto shard_file = std::make_shared(); + shard_file->path = path; + shard_file->handle = get_handle_(path); + shard_file->table = std::vector( + 2 * chunks_per_shard, std::numeric_limits::max()); + shard_file->file_offset = table_size_bytes_; + + shard_files_[path] = std::move(shard_file); + } } - // create parent directories if needed - const auto parent_paths = get_parent_paths(data_paths_); - CHECK(make_dirs(parent_paths, thread_pool_)); // no-op if they exist - - const auto& dims = config_->dimensions; - - const auto n_shards = dims->number_of_shards(); - CHECK(data_paths_.size() == n_shards); - - const uint32_t chunks_per_shard = dims->chunks_per_shard(); const uint32_t chunks_in_mem = dims->number_of_chunks_in_memory(); const uint32_t n_layers = dims->chunk_layers_per_shard(); const uint32_t chunks_per_layer = chunks_per_shard / n_layers; @@ -164,47 +207,32 @@ zarr::FSArray::compress_and_flush_data_() for (auto shard_idx = 0; shard_idx < n_shards; ++shard_idx) { const std::string data_path = data_paths_[shard_idx]; + auto shard_file = shard_files_[data_path]; // chunk storage is at chunk_index - chunk_offset const auto chunk_indices_this_layer = dims->chunk_indices_for_shard_layer(shard_idx, current_layer_); - auto* shard_table = shard_tables_.data() + shard_idx; - auto* file_offset = shard_file_offsets_.data() + shard_idx; - - if (!shard_mutexes_.contains(data_path)) { - shard_mutexes_.emplace(); - } - auto* shard_mutex = &shard_mutexes_[data_path]; - - auto handle = get_handle_(data_path); - if (handle == nullptr) { - LOG_ERROR("Failed to get file handle for ", data_path); - return false; - } - - if (!futures_.contains(data_path)) { - futures_.emplace(data_path, std::vector>{}); - } const auto& params = 
config_->compression_params; for (auto i = 0; i < chunk_indices_this_layer.size(); ++i) { const uint32_t chunk_idx = chunk_indices_this_layer[i]; CHECK(chunk_idx >= chunk_offset); uint32_t internal_index = dims->shard_internal_index(chunk_idx); - const auto& chunk_data = chunk_buffers_[chunk_idx - chunk_offset]; - auto promise = std::make_shared>(); - futures_[data_path].push_back(promise->get_future()); + auto promise = + std::make_shared>(); // TODO (not a shared + // pointer and std::move?) + + auto& chunk_data = chunk_buffers_[chunk_idx - chunk_offset]; + const size_t bytes_of_chunk = chunk_data.size(); + + shard_file->chunk_futures.push_back(promise->get_future()); - auto job = [&chunk_data, + auto job = [chunk_data = std::move(chunk_data), ¶ms, - handle, - data_path, + shard_file, bytes_per_px, - shard_table, internal_index, - file_offset, - shard_mutex, promise](std::string& err) { bool success = true; std::vector compressed; @@ -246,29 +274,29 @@ zarr::FSArray::compress_and_flush_data_() uint64_t file_offset_local; { - std::lock_guard lock(*shard_mutex); - file_offset_local = *file_offset; - *file_offset += chunk_size_out; + std::lock_guard lock(shard_file->offset_mutex); + file_offset_local = shard_file->file_offset; + shard_file->file_offset += chunk_size_out; } // write data success = - seek_and_write(handle.get(), + seek_and_write(shard_file->handle.get(), file_offset_local, std::span(data_out, chunk_size_out)); EXPECT(success, "Failed to write chunk data to ", - data_path, + shard_file->path, " internal index ", internal_index); // write table entry - shard_table->at(2 * internal_index) = file_offset_local; - shard_table->at(2 * internal_index + 1) = chunk_size_out; + shard_file->table[2 * internal_index] = file_offset_local; + shard_file->table[2 * internal_index + 1] = chunk_size_out; } catch (const std::exception& exc) { - err = "Failed to compress chunk " + + err = "Failed to write chunk " + std::to_string(internal_index) + " of shard at " + - 
data_path + ": " + exc.what(); + shard_file->path + ": " + exc.what(); success = false; } @@ -284,7 +312,41 @@ zarr::FSArray::compress_and_flush_data_() LOG_ERROR(err); } } + + if (!is_closing_) { + chunk_buffers_[chunk_idx - chunk_offset].resize(bytes_of_chunk); + } } + + // if we're about to roll over to a new append shard, signal that we're + // not going to add any more chunks and that we can wait to close + // if (current_layer_ == n_layers - 1) { + // auto job = [shard_file, this](std::string& err) -> bool { + // bool success; + // + // try { + // success = shard_file->close(); + // std::unique_lock lock(shard_files_mutex_); + // shard_files_.erase(shard_file->path); + // file_handle_pool_->close_handle(shard_file->path); + // shard_files_cv_.notify_all(); + // } catch (const std::exception& exc) { + // err = exc.what(); + // success = false; + // } + // + // return success; + // }; + // + // // one thread is reserved for processing the frame queue and runs + // // the entire lifetime of the stream + // if (thread_pool_->n_threads() == 1 || + // !thread_pool_->push_job(job)) { + // if (std::string err; !job(err)) { + // LOG_ERROR(err); + // } + // } + // } } return true; @@ -293,27 +355,26 @@ zarr::FSArray::compress_and_flush_data_() void zarr::FSArray::finalize_append_shard_() { - for (auto shard_idx = 0; shard_idx < data_paths_.size(); ++shard_idx) { - const auto& path = data_paths_[shard_idx]; - for (auto& future : futures_[path]) { - future.wait(); - } - futures_.erase(path); - shard_mutexes_.erase(path); - - write_table_entries_(shard_idx); + data_paths_.clear(); - handles_.erase(path); - file_handle_pool_->close_handle(path); + if (is_closing_) { + // close all shards + for (auto& shard_file : shard_files_ | std::views::values) { + EXPECT(shard_file->close(), + "Failed to close shard file at path ", + shard_file->path); + } + shard_files_.clear(); + // wait on all the shards to be written out + // std::unique_lock lock(shard_files_mutex_); + // 
shard_files_cv_.wait(lock, [this] { return shard_files_.empty(); }); } - - data_paths_.clear(); } std::shared_ptr zarr::FSArray::get_handle_(const std::string& path) { - std::unique_lock lock(mutex_); + std::unique_lock lock(handles_mutex_); if (const auto it = handles_.find(path); it != handles_.end()) { return it->second; } @@ -355,5 +416,5 @@ zarr::FSArray::write_table_entries_(uint32_t shard_idx) path); std::ranges::fill(shard_table, std::numeric_limits::max()); - shard_file_offsets_[shard_idx] = table_size_; + shard_file_offsets_[shard_idx] = table_size_bytes_; } diff --git a/src/streaming/fs.array.hh b/src/streaming/fs.array.hh index 72816e17..a18e50bf 100644 --- a/src/streaming/fs.array.hh +++ b/src/streaming/fs.array.hh @@ -17,11 +17,27 @@ class FSArray final std::shared_ptr file_handle_pool); protected: - std::mutex mutex_; - size_t table_size_; - std::unordered_map shard_mutexes_; - std::unordered_map>> futures_; + struct ShardFile + { + std::string path; + std::shared_ptr handle; + std::vector table; + std::mutex table_mutex; + uint64_t file_offset; + std::mutex offset_mutex; + std::vector> chunk_futures; + + [[nodiscard]] bool close(); + }; + + const size_t table_size_bytes_; + + std::unordered_map> shard_files_; + std::mutex shard_files_mutex_; + std::condition_variable shard_files_cv_; + std::unordered_map> handles_; + std::mutex handles_mutex_; bool write_metadata_() override; std::string index_location_() const override; From 60f2b5bdf3cc073f060dc7f1f705f967d4a07f61 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 11:38:30 -0400 Subject: [PATCH 37/38] (wip): update stream-raw-to-filesystem to match benchmark.py config --- .../integration/stream-raw-to-filesystem.cpp | 248 +++++++----------- 1 file changed, 89 insertions(+), 159 deletions(-) diff --git a/tests/integration/stream-raw-to-filesystem.cpp b/tests/integration/stream-raw-to-filesystem.cpp index 974cddcd..e6f9fd86 100644 --- a/tests/integration/stream-raw-to-filesystem.cpp 
+++ b/tests/integration/stream-raw-to-filesystem.cpp @@ -13,44 +13,31 @@ namespace { const std::string test_path = (fs::temp_directory_path() / (TEST ".zarr")).string(); -const unsigned int array_width = 64, array_height = 48, array_planes = 6, - array_channels = 8, array_timepoints = 10; - -const unsigned int chunk_width = 16, chunk_height = 16, chunk_planes = 2, - chunk_channels = 4, chunk_timepoints = 5; - -const unsigned int shard_width = 2, shard_height = 1, shard_planes = 1, - shard_channels = 2, shard_timepoints = 2; -const unsigned int chunks_per_shard = - shard_width * shard_height * shard_planes * shard_channels * shard_timepoints; - -const unsigned int chunks_in_x = +constexpr unsigned int array_width = 2048, array_height = 2048, + array_planes = 1024; +constexpr unsigned int chunk_width = 64, chunk_height = 64, chunk_planes = 64; +constexpr unsigned int shard_width = 16, shard_height = 16, shard_planes = 1; +constexpr unsigned int chunks_per_shard = + shard_width * shard_height * shard_planes; + +constexpr unsigned int chunks_in_x = (array_width + chunk_width - 1) / chunk_width; // 4 chunks -const unsigned int chunks_in_y = +constexpr unsigned int chunks_in_y = (array_height + chunk_height - 1) / chunk_height; // 3 chunks -const unsigned int chunks_in_z = +constexpr unsigned int chunks_in_z = (array_planes + chunk_planes - 1) / chunk_planes; // 3 chunks -const unsigned int chunks_in_c = - (array_channels + chunk_channels - 1) / chunk_channels; // 2 chunks -const unsigned int chunks_in_t = - (array_timepoints + chunk_timepoints - 1) / chunk_timepoints; -const unsigned int shards_in_x = +constexpr unsigned int shards_in_x = (chunks_in_x + shard_width - 1) / shard_width; // 2 shards -const unsigned int shards_in_y = +constexpr unsigned int shards_in_y = (chunks_in_y + shard_height - 1) / shard_height; // 3 shards -const unsigned int shards_in_z = +constexpr unsigned int shards_in_z = (chunks_in_z + shard_planes - 1) / shard_planes; // 3 shards -const 
unsigned int shards_in_c = - (chunks_in_c + shard_channels - 1) / shard_channels; // 1 shard -const unsigned int shards_in_t = - (chunks_in_t + shard_timepoints - 1) / shard_timepoints; // 1 shard -const size_t nbytes_px = sizeof(uint16_t); -const uint32_t frames_to_acquire = - array_planes * array_channels * array_timepoints; -const size_t bytes_of_frame = array_width * array_height * nbytes_px; -} // namespace/s +constexpr size_t nbytes_px = sizeof(uint16_t); +constexpr uint32_t frames_to_acquire = array_planes; +constexpr size_t bytes_of_frame = array_width * array_height * nbytes_px; +} // namespace ZarrStream* setup() @@ -67,27 +54,9 @@ setup() .array_count = 1, }; - CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 5)); + CHECK_OK(ZarrArraySettings_create_dimension_array(settings.arrays, 3)); ZarrDimensionProperties* dim = settings.arrays->dimensions; - *dim = DIM("t", - ZarrDimensionType_Time, - array_timepoints, - chunk_timepoints, - shard_timepoints, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 1; - *dim = DIM("c", - ZarrDimensionType_Channel, - array_channels, - chunk_channels, - shard_channels, - nullptr, - 1.0); - - dim = settings.arrays->dimensions + 2; *dim = DIM("z", ZarrDimensionType_Space, array_planes, @@ -95,8 +64,8 @@ setup() shard_planes, "millimeter", 1.4); + ++dim; - dim = settings.arrays->dimensions + 3; *dim = DIM("y", ZarrDimensionType_Space, array_height, @@ -104,8 +73,8 @@ setup() shard_height, "micrometer", 0.9); + ++dim; - dim = settings.arrays->dimensions + 4; *dim = DIM("x", ZarrDimensionType_Space, array_width, @@ -142,29 +111,12 @@ verify_group_metadata(const nlohmann::json& meta) "'"); const auto axes = multiscales["axes"]; - EXPECT_EQ(size_t, axes.size(), 5); + EXPECT_EQ(size_t, axes.size(), 3); std::string name, type, unit; name = axes[0]["name"]; type = axes[0]["type"]; - EXPECT(name == "t", "Expected name to be 't', but got '", name, "'"); - EXPECT(type == "time", "Expected type to be 'time', but 
got '", type, "'"); - EXPECT(!axes[0].contains("unit"), - "Expected unit to be missing, got ", - axes[0]["unit"].get()); - - name = axes[1]["name"]; - type = axes[1]["type"]; - EXPECT(name == "c", "Expected name to be 'c', but got '", name, "'"); - EXPECT( - type == "channel", "Expected type to be 'channel', but got '", type, "'"); - EXPECT(!axes[1].contains("unit"), - "Expected unit to be missing, got ", - axes[1]["unit"].get()); - - name = axes[2]["name"]; - type = axes[2]["type"]; - unit = axes[2]["unit"]; + unit = axes[0]["unit"]; EXPECT(name == "z", "Expected name to be 'z', but got '", name, "'"); EXPECT( type == "space", "Expected type to be 'space', but got '", type, "'"); @@ -173,9 +125,9 @@ verify_group_metadata(const nlohmann::json& meta) unit, "'"); - name = axes[3]["name"]; - type = axes[3]["type"]; - unit = axes[3]["unit"]; + name = axes[1]["name"]; + type = axes[1]["type"]; + unit = axes[1]["unit"]; EXPECT(name == "y", "Expected name to be 'y', but got '", name, "'"); EXPECT( type == "space", "Expected type to be 'space', but got '", type, "'"); @@ -184,9 +136,9 @@ verify_group_metadata(const nlohmann::json& meta) unit, "'"); - name = axes[4]["name"]; - type = axes[4]["type"]; - unit = axes[4]["unit"]; + name = axes[2]["name"]; + type = axes[2]["type"]; + unit = axes[2]["unit"]; EXPECT(name == "x", "Expected name to be 'x', but got '", name, "'"); EXPECT( type == "space", "Expected type to be 'space', but got '", type, "'"); @@ -207,32 +159,26 @@ verify_group_metadata(const nlohmann::json& meta) type == "scale", "Expected type to be 'scale', but got '", type, "'"); const auto scale = coordinate_transformations["scale"]; - EXPECT_EQ(size_t, scale.size(), 5); - EXPECT_EQ(int, scale[0].get(), 1.0); - EXPECT_EQ(int, scale[1].get(), 1.0); - EXPECT_EQ(int, scale[2].get(), 1.4); - EXPECT_EQ(int, scale[3].get(), 0.9); - EXPECT_EQ(int, scale[4].get(), 0.9); + EXPECT_EQ(size_t, scale.size(), 3); + EXPECT_EQ(int, scale[0].get(), 1.4); + EXPECT_EQ(int, 
scale[1].get(), 0.9); + EXPECT_EQ(int, scale[2].get(), 0.9); } void verify_array_metadata(const nlohmann::json& meta) { const auto& shape = meta["shape"]; - EXPECT_EQ(size_t, shape.size(), 5); - EXPECT_EQ(int, shape[0].get(), array_timepoints); - EXPECT_EQ(int, shape[1].get(), array_channels); - EXPECT_EQ(int, shape[2].get(), array_planes); - EXPECT_EQ(int, shape[3].get(), array_height); - EXPECT_EQ(int, shape[4].get(), array_width); + EXPECT_EQ(size_t, shape.size(), 3); + EXPECT_EQ(int, shape[0].get(), array_planes); + EXPECT_EQ(int, shape[1].get(), array_height); + EXPECT_EQ(int, shape[2].get(), array_width); const auto& chunks = meta["chunk_grid"]["configuration"]["chunk_shape"]; - EXPECT_EQ(size_t, chunks.size(), 5); - EXPECT_EQ(int, chunks[0].get(), chunk_timepoints* shard_timepoints); - EXPECT_EQ(int, chunks[1].get(), chunk_channels* shard_channels); - EXPECT_EQ(int, chunks[2].get(), chunk_planes* shard_planes); - EXPECT_EQ(int, chunks[3].get(), chunk_height* shard_height); - EXPECT_EQ(int, chunks[4].get(), chunk_width* shard_width); + EXPECT_EQ(size_t, chunks.size(), 3); + EXPECT_EQ(int, chunks[0].get(), chunk_planes* shard_planes); + EXPECT_EQ(int, chunks[1].get(), chunk_height* shard_height); + EXPECT_EQ(int, chunks[2].get(), chunk_width* shard_width); const auto dtype = meta["data_type"].get(); EXPECT(dtype == "uint16", @@ -245,12 +191,10 @@ verify_array_metadata(const nlohmann::json& meta) const auto& sharding_codec = codecs[0]["configuration"]; const auto& shards = sharding_codec["chunk_shape"]; - EXPECT_EQ(size_t, shards.size(), 5); - EXPECT_EQ(int, shards[0].get(), chunk_timepoints); - EXPECT_EQ(int, shards[1].get(), chunk_channels); - EXPECT_EQ(int, shards[2].get(), chunk_planes); - EXPECT_EQ(int, shards[3].get(), chunk_height); - EXPECT_EQ(int, shards[4].get(), chunk_width); + EXPECT_EQ(size_t, shards.size(), 3); + EXPECT_EQ(int, shards[0].get(), chunk_planes); + EXPECT_EQ(int, shards[1].get(), chunk_height); + EXPECT_EQ(int, shards[2].get(), 
chunk_width); const auto& internal_codecs = sharding_codec["codecs"]; EXPECT(internal_codecs.size() == 1, @@ -262,85 +206,63 @@ verify_array_metadata(const nlohmann::json& meta) internal_codecs[0]["name"].get()); const auto& dimension_names = meta["dimension_names"]; - EXPECT_EQ(size_t, dimension_names.size(), 5); + EXPECT_EQ(size_t, dimension_names.size(), 3); - EXPECT(dimension_names[0].get() == "t", - "Expected first dimension name to be 't', got ", - dimension_names[0].get()); - EXPECT(dimension_names[1].get() == "c", - "Expected second dimension name to be 'c', got ", - dimension_names[1].get()); - EXPECT(dimension_names[2].get() == "z", + EXPECT(dimension_names[0].get() == "z", "Expected third dimension name to be 'z', got ", - dimension_names[2].get()); - EXPECT(dimension_names[3].get() == "y", + dimension_names[0].get()); + EXPECT(dimension_names[1].get() == "y", "Expected fourth dimension name to be 'y', got ", - dimension_names[3].get()); - EXPECT(dimension_names[4].get() == "x", + dimension_names[1].get()); + EXPECT(dimension_names[2].get() == "x", "Expected fifth dimension name to be 'x', got ", - dimension_names[4].get()); + dimension_names[2].get()); } void verify_file_data() { - const auto chunk_size = chunk_width * chunk_height * chunk_planes * - chunk_channels * chunk_timepoints * nbytes_px; + const auto chunk_size = + chunk_width * chunk_height * chunk_planes * nbytes_px; const auto index_size = chunks_per_shard * sizeof(uint64_t) * // indices are 64 bits 2; // 2 indices per chunk const auto checksum_size = 4; // crc32 checksum is 4 bytes - const auto expected_file_size = shard_width * shard_height * shard_planes * - shard_channels * shard_timepoints * - chunk_size + - index_size + checksum_size; + const auto expected_file_size = + shard_width * shard_height * shard_planes * chunk_size + index_size + + checksum_size; fs::path data_root = fs::path(test_path) / "0"; CHECK(fs::is_directory(data_root)); - for (auto t = 0; t < shards_in_t; ++t) { - 
const auto t_dir = data_root / "c" / std::to_string(t); - CHECK(fs::is_directory(t_dir)); - - for (auto c = 0; c < shards_in_c; ++c) { - const auto c_dir = t_dir / std::to_string(c); - CHECK(fs::is_directory(c_dir)); - - for (auto z = 0; z < shards_in_z; ++z) { - const auto z_dir = c_dir / std::to_string(z); - CHECK(fs::is_directory(z_dir)); - - for (auto y = 0; y < shards_in_y; ++y) { - const auto y_dir = z_dir / std::to_string(y); - CHECK(fs::is_directory(y_dir)); - - for (auto x = 0; x < shards_in_x; ++x) { - const auto x_file = y_dir / std::to_string(x); - CHECK(fs::is_regular_file(x_file)); - const auto file_size = fs::file_size(x_file); - EXPECT(file_size == expected_file_size, - "Expected file size == ", - expected_file_size, - " for file ", - x_file.string(), - ", got ", - file_size); - } - - CHECK(!fs::is_regular_file(y_dir / - std::to_string(shards_in_x))); - } - - CHECK(!fs::is_directory(z_dir / std::to_string(shards_in_y))); + for (auto z = 0; z < shards_in_z; ++z) { + const auto z_dir = data_root / "c" / std::to_string(z); + CHECK(fs::is_directory(z_dir)); + + for (auto y = 0; y < shards_in_y; ++y) { + const auto y_dir = z_dir / std::to_string(y); + CHECK(fs::is_directory(y_dir)); + + for (auto x = 0; x < shards_in_x; ++x) { + const auto x_file = y_dir / std::to_string(x); + CHECK(fs::is_regular_file(x_file)); + const auto file_size = fs::file_size(x_file); + EXPECT(file_size == expected_file_size, + "Expected file size == ", + expected_file_size, + " for file ", + x_file.string(), + ", got ", + file_size); } - CHECK(!fs::is_directory(c_dir / std::to_string(shards_in_z))); + CHECK(!fs::is_regular_file(y_dir / std::to_string(shards_in_x))); } - CHECK(!fs::is_directory(t_dir / std::to_string(shards_in_c))); + CHECK(!fs::is_directory(z_dir / std::to_string(shards_in_y))); } - CHECK(!fs::is_directory(data_root / "c" / std::to_string(shards_in_t))); + CHECK(!fs::is_directory(data_root / "c" / std::to_string(shards_in_z))); } void @@ -378,18 +300,22 @@ 
verify() int main() { - Zarr_set_log_level(ZarrLogLevel_Debug); + Zarr_set_log_level(ZarrLogLevel_Info); auto* stream = setup(); - std::vector frame(array_width * array_height, 0); + const std::vector frame(array_width * array_height, 0); int retval = 1; try { size_t bytes_out; for (auto i = 0; i < frames_to_acquire; ++i) { - ZarrStatusCode status = ZarrStream_append( + const auto t1 = std::chrono::high_resolution_clock::now(); + const ZarrStatusCode status = ZarrStream_append( stream, frame.data(), bytes_of_frame, &bytes_out, nullptr); + const auto t2 = std::chrono::high_resolution_clock::now(); + const std::chrono::duration fp_ms = t2 - t1; + LOG_INFO("Appending frame ", i, " took ", fp_ms.count(), " ms"); EXPECT(status == ZarrStatusCode_Success, "Failed to append frame ", i, @@ -398,7 +324,11 @@ main() EXPECT_EQ(size_t, bytes_out, bytes_of_frame); } + const auto t1 = std::chrono::high_resolution_clock::now(); ZarrStream_destroy(stream); + const auto t2 = std::chrono::high_resolution_clock::now(); + const std::chrono::duration fp_ms = t2 - t1; + LOG_INFO("Closing stream took ", fp_ms.count(), " ms"); verify(); From c818bd0ee3af30f319870e23aed85c74a83e0ac0 Mon Sep 17 00:00:00 2001 From: Alan Liddell Date: Thu, 30 Oct 2025 16:10:39 -0400 Subject: [PATCH 38/38] (wip): sprinkle some more omp fairydust --- src/streaming/array.cpp | 2 +- src/streaming/fs.array.cpp | 12 ++++++++++-- src/streaming/s3.array.cpp | 4 ++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/streaming/array.cpp b/src/streaming/array.cpp index 8efd3d0f..1456774f 100644 --- a/src/streaming/array.cpp +++ b/src/streaming/array.cpp @@ -116,7 +116,7 @@ zarr::Array::write_frame(std::vector& data) return 0; } - if (bytes_to_flush_ == 0) { // first frame, we need to init the buffers + if (frames_written_ == 0) { // first frame, we need to init the buffers fill_buffers_(); } diff --git a/src/streaming/fs.array.cpp b/src/streaming/fs.array.cpp index be6d0299..79df421a 100644 --- 
a/src/streaming/fs.array.cpp +++ b/src/streaming/fs.array.cpp @@ -215,6 +215,11 @@ zarr::FSArray::compress_and_flush_data_() const auto& params = config_->compression_params; + const size_t future_offset = shard_file->chunk_futures.size(); + shard_file->chunk_futures.resize(shard_file->chunk_futures.size() + + chunk_indices_this_layer.size()); + +#pragma omp parallel for for (auto i = 0; i < chunk_indices_this_layer.size(); ++i) { const uint32_t chunk_idx = chunk_indices_this_layer[i]; CHECK(chunk_idx >= chunk_offset); @@ -226,7 +231,8 @@ zarr::FSArray::compress_and_flush_data_() auto& chunk_data = chunk_buffers_[chunk_idx - chunk_offset]; const size_t bytes_of_chunk = chunk_data.size(); - shard_file->chunk_futures.push_back(promise->get_future()); + shard_file->chunk_futures[i + future_offset] = promise->get_future(); + // shard_file->chunk_futures.push_back(promise->get_future()); auto job = [chunk_data = std::move(chunk_data), &params, @@ -314,7 +320,9 @@ zarr::FSArray::compress_and_flush_data_() } if (!is_closing_) { - chunk_buffers_[chunk_idx - chunk_offset].resize(bytes_of_chunk); + auto& chunk = chunk_buffers_[chunk_idx - chunk_offset]; + chunk.resize(bytes_of_chunk); + std::ranges::fill(chunk, 0); } } diff --git a/src/streaming/s3.array.cpp b/src/streaming/s3.array.cpp index 3b55d916..525ed3af 100644 --- a/src/streaming/s3.array.cpp +++ b/src/streaming/s3.array.cpp @@ -61,6 +61,10 @@ zarr::S3Array::compress_and_flush_data_() return false; } + if (!is_closing_) { + fill_buffers_(); + } + return true; }