From e73ef14eb3ab472054e0e2397414d4d799c9d55d Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 22 Oct 2025 12:35:06 -0400 Subject: [PATCH 1/3] chore: add `IndexedList::splice` method and other tweaks (#5670) This pull request is made in support of #5655 which remains in draft; the intent is to pull these pieces out of that in order to have a more focused code review of both pull requests. We have a container class `IndexedList` which is used to store non-movable data (and thus not usable in `std::vector`) with indexing (and thus not usable in `std::list`). The implementation uses both `std::list` and `std::vector` underneath. This pull request makes some improvements to this class: - we add the `splice` method whose semantics are the same as those of the `std::list` `splice` method. This method transfers elements from one `IndexedList` to another. In support of this we add new `iterator` and `const_iterator` types. - we provide the constructor definition in the header so that implementations do not have to declare their own specialization. - we update the `resize` method to accept variadic copyable arguments which will be used to initialize each of the new elements of the container. We also add unit tests for `splice` correctness, and `resize` avoiding the move constructor. 
--- TYPE: NO_HISTORY DESC: Add `IndexedList::splice` --- tiledb/common/indexed_list.h | 215 ++++++++++++-- tiledb/common/test/CMakeLists.txt | 6 + tiledb/common/test/unit_indexed_list.cc | 264 ++++++++++++++++++ tiledb/sm/buffer/buffer_list.cc | 16 +- tiledb/sm/query/legacy/reader.cc | 6 +- tiledb/sm/query/readers/reader_base.cc | 7 - .../sm/query/writers/global_order_writer.cc | 6 +- tiledb/sm/query/writers/ordered_writer.cc | 7 +- tiledb/sm/query/writers/unordered_writer.cc | 3 +- tiledb/sm/query/writers/writer_base.cc | 14 - 10 files changed, 479 insertions(+), 65 deletions(-) create mode 100644 tiledb/common/test/unit_indexed_list.cc diff --git a/tiledb/common/indexed_list.h b/tiledb/common/indexed_list.h index ec1f794b8a6..b9ab5fd3b55 100644 --- a/tiledb/common/indexed_list.h +++ b/tiledb/common/indexed_list.h @@ -38,12 +38,13 @@ #include #include -namespace tiledb::sm { -class MemoryTracker; -} - namespace tiledb::common { +namespace detail { +template +class WhiteboxIndexedList; +} + /** * Container class for data that cannot be moved but that we want to access by * an index. @@ -52,7 +53,136 @@ namespace tiledb::common { */ template class IndexedList { + using Self = IndexedList; + using Elements = tdb::pmr::list; + using Indexes = std::vector; + + friend class detail::WhiteboxIndexedList; + public: + /** + * Iterator handle to a mutable element of an `IndexedList`. 
+ */ + class iterator { + Elements::iterator elt_; /* position in the element list */ + Indexes::iterator idx_; /* position in the pointer index, stepped together with elt_ */ + + friend class IndexedList; + + iterator(Elements::iterator elt, Indexes::iterator idx) + : elt_(elt) + , idx_(idx) { + } + + public: + using difference_type = int64_t; + using value_type = T; + + T& operator*() const { + return *elt_; + } + + T* operator->() const { + return &*elt_; + } + + iterator& operator++() { + ++elt_; + ++idx_; + return *this; + } + + iterator operator++(int) { /* post-increment: returns the position held before advancing */ + iterator before = *this; + ++*this; + return before; + } + + iterator& operator--() { + --elt_; + --idx_; + return *this; + } + + iterator operator--(int) { /* post-decrement: returns the position held before retreating */ + iterator before = *this; + --*this; + return before; + } + + bool operator==(const iterator& other) const { + return elt_ == other.elt_; /* only the list position is compared */ + } + + bool operator!=(const iterator& other) const { + return !(*this == other); + } + }; + + /** + * Iterator handle to an immutable element of an `IndexedList`. + */ + class const_iterator { + Elements::const_iterator elt_; /* position in the element list */ + Indexes::const_iterator idx_; /* position in the pointer index, stepped together with elt_ */ + + friend class IndexedList; + + const_iterator(Elements::const_iterator elt, Indexes::const_iterator idx) + : elt_(elt) + , idx_(idx) { + } + + public: + using difference_type = int64_t; + using value_type = T; + + const_iterator(IndexedList::iterator ii) /* implicit conversion from the mutable iterator */ + : elt_(ii.elt_) + , idx_(ii.idx_) { + } + + const T& operator*() const { + return *elt_; + } + + const T* operator->() const { + return &*elt_; + } + + const_iterator& operator++() { + ++elt_; + ++idx_; + return *this; + } + + const_iterator operator++(int) { /* post-increment: returns the position held before advancing */ + const_iterator before = *this; + ++*this; + return before; + } + + const_iterator& operator--() { + --elt_; + --idx_; + return *this; + } + + const_iterator operator--(int) { /* post-decrement: returns the position held before retreating */ + const_iterator before = *this; + --*this; + return before; + } + + bool operator==(const const_iterator& other) const { + return elt_ == other.elt_; /* only the list position is compared */ + } + + bool operator!=(const const_iterator& other) const { + return !(*this == other); + } + }; + + /* ********************************* */ /* CONSTRUCTORS & DESTRUCTORS */ /* ********************************* 
*/ @@ -65,7 +195,9 @@ class IndexedList { * * @param memory_tracker The memory tracker for the underlying containers. */ - explicit IndexedList(shared_ptr memory_tracker); + explicit IndexedList(pmr::memory_resource* resource) + : list_(resource) { + } DISABLE_COPY_AND_COPY_ASSIGN(IndexedList); DISABLE_MOVE_AND_MOVE_ASSIGN(IndexedList); @@ -92,24 +224,20 @@ class IndexedList { return list_.get_allocator(); } - /** Returns an iterator to the beginning of the items. */ - typename std::list::iterator begin() { - return list_.begin(); + iterator begin() { + return iterator(list_.begin(), vec_.begin()); } - /** Returns an iterator to the beginning of the items. */ - typename std::list::const_iterator begin() const { - return list_.begin(); + const_iterator begin() const { + return const_iterator(list_.begin(), vec_.begin()); } - /** Returns an iterator to the end of the items. */ - typename std::list::iterator end() { - return list_.end(); + iterator end() { + return iterator(list_.end(), vec_.end()); } - /** Returns an iterator to the end of the items. */ - typename std::list::const_iterator end() const { - return list_.end(); + const_iterator end() const { + return const_iterator(list_.end(), vec_.end()); } /** Returns wether the container is empty or not. */ @@ -144,7 +272,8 @@ class IndexedList { * * @param num Number of items to add. */ - void resize(size_t num) { + template + void resize(size_t num, Args... args) { if (list_.size() != 0 || vec_.size() != 0) { throw std::logic_error( "Resize should only be called on empty container."); @@ -152,7 +281,12 @@ class IndexedList { vec_.reserve(num); for (uint64_t n = 0; n < num; n++) { - emplace_back(memory_tracker_); + std::tuple...> copied_args(args...); + std::apply( + [this](auto&&... copied_arg) { + this->emplace_back(std::move(copied_arg)...); + }, + copied_args); } } @@ -204,15 +338,48 @@ class IndexedList { return *(vec_.at(index)); } - private: - /** The memory tracker for the underlying list. 
*/ - shared_ptr memory_tracker_; + /** + * @return a pointer to the last element of the list + */ + T* back() { + if (empty()) { + return nullptr; + } else { + return vec_.back(); + } + } + /** + * @return a pointer to the last element of the list + */ + const T* back() const { + if (empty()) { + return nullptr; + } else { + return vec_.back(); + } + } + + /** + * Transfers the elements from `other` between its positions `first` and + * `last` to `*this`. The elements are inserted at `pos`. + */ + void splice( + const_iterator pos, + Self& other, + const_iterator first, + const_iterator last) { + list_.splice(pos.elt_, other.list_, first.elt_, last.elt_); + vec_.insert(pos.idx_, first.idx_, last.idx_); + other.vec_.erase(first.idx_, last.idx_); + } + + private: /** List that contains all the elements. */ - tdb::pmr::list list_; + Elements list_; /** Vector that contains a pointer to the elements allowing indexed access. */ - std::vector vec_; + Indexes vec_; }; } // namespace tiledb::common diff --git a/tiledb/common/test/CMakeLists.txt b/tiledb/common/test/CMakeLists.txt index 4fcc39ec362..a8bee29aee4 100644 --- a/tiledb/common/test/CMakeLists.txt +++ b/tiledb/common/test/CMakeLists.txt @@ -34,6 +34,12 @@ commence(unit_test arithmetic) endif () conclude(unit_test) +commence(unit_test indexed_list) + this_target_sources(main.cc unit_indexed_list.cc ${CMAKE_SOURCE_DIR}/test/support/src/mem_helpers.cc) + this_target_object_libraries(baseline) + this_target_link_libraries(rapidcheck) +conclude(unit_test) + commence(unit_test memory_tracker) this_target_sources(main.cc unit_memory_tracker.cc unit_memory_tracker_types.cc) this_target_object_libraries(baseline) diff --git a/tiledb/common/test/unit_indexed_list.cc b/tiledb/common/test/unit_indexed_list.cc new file mode 100644 index 00000000000..ef4869979a5 --- /dev/null +++ b/tiledb/common/test/unit_indexed_list.cc @@ -0,0 +1,264 @@ +/** + * @file unit_indexed_list.cc + * + * @section LICENSE + * + * The MIT License + * + 
* @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file tests the `IndexedList` data structure. 
+ */ + +#include +#include +#include + +#include "tiledb/common/indexed_list.h" +#include "tiledb/common/memory_tracker.h" + +#include +#include + +using namespace tiledb::common; +using namespace tiledb::test; + +static_assert(std::input_iterator::iterator>); +static_assert(std::input_iterator::const_iterator>); + +namespace tiledb::common::detail { + +template +class WhiteboxIndexedList : public IndexedList { + public: + using IndexedList::list_; + using IndexedList::vec_; + + void integrity_check() const { + REQUIRE(list_.size() == vec_.size()); + + auto value = list_.begin(); + for (uint64_t i = 0; i < vec_.size(); ++i, ++value) { + REQUIRE(vec_[i] == &*value); + } + } +}; + +} // namespace tiledb::common::detail + +template +void integrity_check(const IndexedList& value) { + static_cast&>(value).integrity_check(); +} + +template +std::vector instance_iterator(std::vector values_in) { + auto mem_tracker = get_test_memory_tracker(); + auto mem_res = mem_tracker->get_resource(tiledb::sm::MemoryType::WRITER_DATA); + + IndexedList ii(mem_res); + for (const auto& value : values_in) { + ii.emplace_back(value); + } + + const std::vector values_out(ii.begin(), ii.end()); + + CHECK(values_in == values_out); + + integrity_check(ii); + + return values_out; +} + +TEST_CASE("IndexedList iterator", "[algorithm]") { + const uint64_t num_values = GENERATE(0, 1, 2, 4, 8, 16, 32); + + DYNAMIC_SECTION("num_values = " << num_values) { + std::vector values; + values.resize(num_values); + std::iota(values.begin(), values.end(), 0); + + instance_iterator(values); + } +} + +template +std::pair, std::list> instance_splice( + std::list values_in, + uint64_t insert_pos, + std::list values_splice, + uint64_t splice_first, + uint64_t splice_last) { + auto mem_tracker = get_test_memory_tracker(); + auto mem_res = mem_tracker->get_resource(tiledb::sm::MemoryType::WRITER_DATA); + + IndexedList idst(mem_res); + IndexedList isplice(mem_res); + + for (const auto& value : values_in) { + 
idst.emplace_back(value); + } + for (const auto& value : values_splice) { + isplice.emplace_back(value); + } + + idst.splice( + std::next(idst.begin(), insert_pos), + isplice, + std::next(isplice.begin(), splice_first), + std::next(isplice.begin(), splice_last)); + + const std::list values_out(idst.begin(), idst.end()); + const std::list splice_out(isplice.begin(), isplice.end()); + + // check generic correctness + values_in.splice( + std::next(values_in.begin(), insert_pos), + values_splice, + std::next(values_splice.begin(), splice_first), + std::next(values_splice.begin(), splice_last)); + CHECK(values_out == values_in); + CHECK(splice_out == values_splice); + + integrity_check(idst); + integrity_check(isplice); + + return std::make_pair(values_out, splice_out); +} + +TEST_CASE("IndexedList splice", "[algorithm]") { + SECTION("Trivial") { + const auto r = instance_splice({}, 0, {}, 0, 0); + CHECK(r.first == std::list{}); + CHECK(r.second == std::list{}); + } + + SECTION("Transfer all to empty") { + const auto r = instance_splice({}, 0, {0, 1, 2, 3}, 0, 4); + CHECK(r.first == std::list{0, 1, 2, 3}); + CHECK(r.second == std::list{}); + } + + SECTION("Transfer subset to empty") { + const auto r = instance_splice({}, 0, {0, 1, 2, 3}, 1, 3); + CHECK(r.first == std::list{1, 2}); + CHECK(r.second == std::list{0, 3}); + } + + SECTION("Transfer empty to nonempty") { + const auto r = instance_splice({0, 1, 2, 3}, 0, {}, 0, 0); + CHECK(r.first == std::list{0, 1, 2, 3}); + CHECK(r.second == std::list{}); + } + + SECTION("Transfer to nonempty end") { + const auto r = + instance_splice({0, 1, 2, 3}, 4, {4, 5, 6, 7}, 1, 3); + CHECK(r.first == std::list{0, 1, 2, 3, 5, 6}); + CHECK(r.second == std::list{4, 7}); + } + + SECTION("Transfer to nonempty intermediate position") { + const auto r = + instance_splice({0, 1, 2, 3}, 2, {4, 5, 6, 7}, 1, 3); + CHECK(r.first == std::list{0, 1, 5, 6, 2, 3}); + CHECK(r.second == std::list{4, 7}); + } +} + +TEST_CASE("IndexedList splice 
rapidcheck", "[algorithm][rapidcheck]") { + rc::prop( + "Splice correctness", + [](std::list target, std::list src) { + const uint64_t targetpos = + *rc::gen::inRange(0, target.size() + 1); + const uint64_t srcfirst = + *rc::gen::inRange(0, src.size() + 1); + const uint64_t srclast = + *rc::gen::inRange(srcfirst, src.size() + 1); + + instance_splice(target, targetpos, src, srcfirst, srclast); + }); +} + +/** + * Test that `IndexedList::resize` correctly initializes each new element + * by copying the variadic arguments for each element. + */ +TEST_CASE("IndexedList resize copy args", "[algorithm]") { + auto mem_tracker = get_test_memory_tracker(); + auto mem_res = mem_tracker->get_resource(tiledb::sm::MemoryType::WRITER_DATA); + + SECTION("POD") { + SECTION("Default") { + IndexedList ll(mem_res); + ll.resize(8); + + std::vector vv(ll.begin(), ll.end()); + CHECK(vv == std::vector{0, 0, 0, 0, 0, 0, 0, 0}); + } + SECTION("Copy value") { + IndexedList ll(mem_res); + ll.resize(8, 123); + + std::vector vv(ll.begin(), ll.end()); + CHECK( + vv == std::vector{123, 123, 123, 123, 123, 123, 123, 123}); + } + } + + SECTION("shared_ptr") { + SECTION("Default") { + IndexedList> ll(mem_res); + ll.resize(8); + + std::vector> vv(ll.begin(), ll.end()); + CHECK( + vv == std::vector>{ + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr}); + } + + /** + * Notably this fails if the move constructor rather than the copy + * constructor is used, since the first move steals the value. 
+ */ + SECTION("Alias") { + std::shared_ptr value(new uint64_t(123)); + + IndexedList> ll(mem_res); + ll.resize(8, value); + + std::vector> vv(ll.begin(), ll.end()); + CHECK( + vv == std::vector>{ + value, value, value, value, value, value, value, value}); + } + } +} diff --git a/tiledb/sm/buffer/buffer_list.cc b/tiledb/sm/buffer/buffer_list.cc index 32c4bd3d583..02315487125 100644 --- a/tiledb/sm/buffer/buffer_list.cc +++ b/tiledb/sm/buffer/buffer_list.cc @@ -38,19 +38,10 @@ using namespace tiledb::common; -namespace tiledb { -template <> -IndexedList::IndexedList( - shared_ptr memory_tracker) - : memory_tracker_(memory_tracker) - , list_( - memory_tracker->get_resource(sm::MemoryType::SERIALIZATION_BUFFER)) { -} - -namespace sm { +namespace tiledb::sm { BufferList::BufferList(shared_ptr memory_tracker) - : buffers_(memory_tracker) + : buffers_(memory_tracker->get_resource(MemoryType::SERIALIZATION_BUFFER)) , current_buffer_index_(0) , current_relative_offset_(0) , offset_(0) { @@ -154,5 +145,4 @@ uint64_t BufferList::total_size() const { return size; } -} // namespace sm -} // namespace tiledb +} // namespace tiledb::sm diff --git a/tiledb/sm/query/legacy/reader.cc b/tiledb/sm/query/legacy/reader.cc index bb8c85d7929..36dbfb111a1 100644 --- a/tiledb/sm/query/legacy/reader.cc +++ b/tiledb/sm/query/legacy/reader.cc @@ -1804,7 +1804,8 @@ Status Reader::dense_read() { // `sparse_result_tiles` will hold all the relevant result tiles of // sparse fragments std::vector result_coords; - IndexedList sparse_result_tiles(query_memory_tracker_); + IndexedList sparse_result_tiles( + query_memory_tracker_->get_resource(MemoryType::RESULT_TILE)); RETURN_NOT_OK(compute_result_coords(sparse_result_tiles, result_coords)); // Compute result cell slabs. 
@@ -2043,7 +2044,8 @@ Status Reader::sparse_read() { // `sparse_result_tiles` will hold all the relevant result tiles of // sparse fragments std::vector result_coords; - IndexedList sparse_result_tiles(query_memory_tracker_); + IndexedList sparse_result_tiles( + query_memory_tracker_->get_resource(MemoryType::RESULT_TILE)); RETURN_NOT_OK(compute_result_coords(sparse_result_tiles, result_coords)); std::vector result_tiles; diff --git a/tiledb/sm/query/readers/reader_base.cc b/tiledb/sm/query/readers/reader_base.cc index fe1b022af87..7cdb1230d5e 100644 --- a/tiledb/sm/query/readers/reader_base.cc +++ b/tiledb/sm/query/readers/reader_base.cc @@ -1459,10 +1459,3 @@ template void ReaderBase::validate_attribute_order( std::vector&); } // namespace tiledb::sm - -template <> -IndexedList::IndexedList( - shared_ptr memory_tracker) - : memory_tracker_(memory_tracker) - , list_(memory_tracker->get_resource(sm::MemoryType::RESULT_TILE)) { -} diff --git a/tiledb/sm/query/writers/global_order_writer.cc b/tiledb/sm/query/writers/global_order_writer.cc index 21254a764f2..38b3a3523ba 100644 --- a/tiledb/sm/query/writers/global_order_writer.cc +++ b/tiledb/sm/query/writers/global_order_writer.cc @@ -206,7 +206,8 @@ Status GlobalOrderWriter::init_global_write_state() { auto last_tiles_it = global_write_state_->last_tiles_.emplace( std::piecewise_construct, std::forward_as_tuple(name), - std::forward_as_tuple(query_memory_tracker_)); + std::forward_as_tuple( + query_memory_tracker_->get_resource(MemoryType::WRITER_TILE_DATA))); last_tiles_it.first->second.emplace_back( array_schema_, cell_num_per_tile, @@ -862,7 +863,8 @@ Status GlobalOrderWriter::prepare_full_tiles( tiles->emplace( std::piecewise_construct, std::forward_as_tuple(it.first), - std::forward_as_tuple(query_memory_tracker_)); + std::forward_as_tuple( + query_memory_tracker_->get_resource(MemoryType::WRITER_TILE_DATA))); } auto num = buffers_.size(); diff --git a/tiledb/sm/query/writers/ordered_writer.cc 
b/tiledb/sm/query/writers/ordered_writer.cc index 3a3ec42e04e..83be8e671f7 100644 --- a/tiledb/sm/query/writers/ordered_writer.cc +++ b/tiledb/sm/query/writers/ordered_writer.cc @@ -205,7 +205,8 @@ Status OrderedWriter::ordered_write() { tiles.emplace( std::piecewise_construct, std::forward_as_tuple(buff.first), - std::forward_as_tuple(query_memory_tracker_)); + std::forward_as_tuple( + query_memory_tracker_->get_resource(MemoryType::WRITER_TILE_DATA))); } if (attr_num > tile_num) { // Parallelize over attributes @@ -316,7 +317,9 @@ Status OrderedWriter::prepare_filter_and_write_tiles( // Process batches uint64_t frag_tile_id = 0; bool close_files = false; - tile_batches.resize(batch_num); + tile_batches.resize( + batch_num, + query_memory_tracker_->get_resource(MemoryType::WRITER_TILE_DATA)); std::optional write_task = nullopt; for (uint64_t b = 0; b < batch_num; ++b) { auto batch_size = (b == batch_num - 1) ? last_batch_size : thread_num; diff --git a/tiledb/sm/query/writers/unordered_writer.cc b/tiledb/sm/query/writers/unordered_writer.cc index 1779a204253..ed568d2fa87 100644 --- a/tiledb/sm/query/writers/unordered_writer.cc +++ b/tiledb/sm/query/writers/unordered_writer.cc @@ -376,7 +376,8 @@ Status UnorderedWriter::prepare_tiles( tiles->emplace( std::piecewise_construct, std::forward_as_tuple(name), - std::forward_as_tuple(query_memory_tracker_)); + std::forward_as_tuple(query_memory_tracker_->get_resource( + MemoryType::WRITER_TILE_DATA))); } } diff --git a/tiledb/sm/query/writers/writer_base.cc b/tiledb/sm/query/writers/writer_base.cc index 4962018733e..3e13b990dc9 100644 --- a/tiledb/sm/query/writers/writer_base.cc +++ b/tiledb/sm/query/writers/writer_base.cc @@ -1185,17 +1185,3 @@ bool WriterBase::remote_query() const { } } // namespace tiledb::sm - -template <> -IndexedList::IndexedList( - shared_ptr memory_tracker) - : memory_tracker_(memory_tracker) - , list_(memory_tracker->get_resource(sm::MemoryType::WRITER_TILE_DATA)) { -} - -template <> 
-IndexedList>:: - IndexedList(shared_ptr memory_tracker) - : memory_tracker_(memory_tracker) - , list_(memory_tracker->get_resource(sm::MemoryType::WRITER_TILE_DATA)) { -} From 8fcdd4d80b61f98c5fc3bc784b99cfac129037f2 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Fri, 31 Oct 2025 07:25:18 -0400 Subject: [PATCH 2/3] chore: refactor and improve test data structure `struct Fragment` (#5675) This pull request does some refactoring and enhancing of our test data structures `struct Fragment1D` and `struct Fragment2D` which are used as containers for inputs to write queries and outputs of read queries. 1) we move the methods which generically create arrays using these and write fragments using these to a header file instead of `sparse_global_order_reader.cc` 2) we refactor functionality into a `template struct Fragment` which is a common base class for `Fragment1D`, `Fragment2D`, a new `Fragment3D` and can be used with an empty dimension tuple for dense fragments 3) we implement various fixes to enable variable-length dimensions 4) some other incidentals This pull request has been refactored out of #5646 for two reasons: 1) it is all enhancements to test code which may be distracting from the functional components of #5646 ; 2) I anticipate using it to add additional tests to #5655 . Specifically, using the `Fragment` with an empty dimension tuple will be very nice for writing tests on dense arrays. There's no direct testing of this, but since the code was factored out of #5646 all of it was used in testing there. Many components are exercised due to the refactor of the existing tests. 
--- TYPE: NO_HISTORY DESC: refactors and enhancements to test data structure `struct Fragment` --- test/src/unit-sparse-global-order-reader.cc | 218 +++------ test/support/rapidcheck/array_templates.h | 144 +++++- test/support/src/array_templates.h | 477 ++++++++++++++++---- tiledb/common/types/untyped_datum.h | 10 +- tiledb/sm/cpp_api/array.h | 5 + tiledb/sm/cpp_api/query.h | 28 ++ 6 files changed, 633 insertions(+), 249 deletions(-) diff --git a/test/src/unit-sparse-global-order-reader.cc b/test/src/unit-sparse-global-order-reader.cc index 0046caae331..8c5c75d730a 100644 --- a/test/src/unit-sparse-global-order-reader.cc +++ b/test/src/unit-sparse-global-order-reader.cc @@ -189,7 +189,7 @@ struct FxRun1D { if (subarray.empty()) { return true; } else { - const CoordType coord = fragment.dim_[record]; + const CoordType coord = fragment.dimension()[record]; for (const auto& range : subarray) { if (range.contains(coord)) { return true; @@ -349,7 +349,7 @@ struct FxRun2D { if (subarray.empty() && !condition.has_value()) { return true; } else { - const int r = fragment.d1_[record], c = fragment.d2_[record]; + const int r = fragment.d1()[record], c = fragment.d2()[record]; for (const auto& range : subarray) { if (range.first.has_value() && !range.first->contains(r)) { continue; @@ -649,36 +649,11 @@ void CSparseGlobalOrderFx::write_fragment( } CApiArray& array = *existing; + Context cppctx = vfs_test_setup_.ctx(); + Array cpparray(cppctx, array, false); - // Create the query. 
- tiledb_query_t* query; - auto rc = tiledb_query_alloc(context(), array, TILEDB_WRITE, &query); - ASSERTER(rc == TILEDB_OK); - rc = tiledb_query_set_layout(context(), query, TILEDB_UNORDERED); - ASSERTER(rc == TILEDB_OK); - - auto field_sizes = templates::query::make_field_sizes(fragment); - templates::query::set_fields( - context(), - query, - field_sizes, - fragment, - [](unsigned d) { return "d" + std::to_string(d + 1); }, - [](unsigned a) { return "a" + std::to_string(a + 1); }); - - // Submit query. - rc = tiledb_query_submit(context(), query); - ASSERTER(std::optional() == error_if_any(rc)); - - // check that sizes match what we expect - const uint64_t expect_num_cells = fragment.size(); - const uint64_t num_cells = - templates::query::num_cells(fragment, field_sizes); - - ASSERTER(num_cells == expect_num_cells); - - // Clean up. - tiledb_query_free(&query); + templates::query::write_fragment( + fragment, cpparray, TILEDB_UNORDERED); } void CSparseGlobalOrderFx::write_1d_fragment_strings( @@ -1361,23 +1336,23 @@ TEST_CASE_METHOD( // Write a fragment F0 with unique coordinates InstanceType::FragmentType fragment0; - fragment0.dim_.resize(fragment_size); - std::iota(fragment0.dim_.begin(), fragment0.dim_.end(), 1); + fragment0.dimension().resize(fragment_size); + std::iota(fragment0.dimension().begin(), fragment0.dimension().end(), 1); // Write a fragment F1 with lots of duplicates // [100,100,100,100,100,101,101,101,101,101,102,102,102,102,102,...] 
InstanceType::FragmentType fragment1; - fragment1.dim_.resize(fragment0.dim_.num_cells()); - for (size_t i = 0; i < fragment1.dim_.num_cells(); i++) { - fragment1.dim_[i] = - static_cast((i / 10) + (fragment0.dim_.num_cells() / 2)); + fragment1.dimension().resize(fragment0.dimension().num_cells()); + for (size_t i = 0; i < fragment1.dimension().num_cells(); i++) { + fragment1.dimension()[i] = + static_cast((i / 10) + (fragment0.dimension().num_cells() / 2)); } // atts are whatever, used just for query condition and correctness check auto& f0atts = std::get<0>(fragment0.atts_); - f0atts.resize(fragment0.dim_.num_cells()); + f0atts.resize(fragment0.dimension().num_cells()); std::iota(f0atts.begin(), f0atts.end(), 0); - for (uint64_t i = 0; i < fragment0.dim_.num_cells(); i++) { + for (uint64_t i = 0; i < fragment0.dimension().num_cells(); i++) { if ((i * i) % 7 == 0) { std::get<1>(fragment0.atts_).push_back(std::nullopt); } else { @@ -1390,9 +1365,9 @@ TEST_CASE_METHOD( } auto& f1atts = std::get<0>(fragment1.atts_); - f1atts.resize(fragment1.dim_.num_cells()); - std::iota(f1atts.begin(), f1atts.end(), int(fragment0.dim_.num_cells())); - for (uint64_t i = 0; i < fragment1.dim_.num_cells(); i++) { + f1atts.resize(fragment1.dimension().num_cells()); + std::iota(f1atts.begin(), f1atts.end(), int(fragment0.num_cells())); + for (uint64_t i = 0; i < fragment1.num_cells(); i++) { if ((i * i) % 11 == 0) { std::get<1>(fragment1.atts_).push_back(std::nullopt); } else { @@ -1492,25 +1467,25 @@ TEST_CASE_METHOD( templates::Fragment1D fragment1; // Write a fragment F0 with tiles [1,3][3,5][5,7][7,9]... 
- fragment0.dim_.resize(fragment_size); - fragment0.dim_[0] = 1; - for (size_t i = 1; i < fragment0.dim_.num_cells(); i++) { - fragment0.dim_[i] = static_cast(1 + 2 * ((i + 1) / 2)); + fragment0.dimension().resize(fragment_size); + fragment0.dimension()[0] = 1; + for (size_t i = 1; i < fragment0.dimension().num_cells(); i++) { + fragment0.dimension()[i] = static_cast(1 + 2 * ((i + 1) / 2)); } // Write a fragment F1 with tiles [2,4][4,6][6,8][8,10]... - fragment1.dim_.resize(fragment0.dim_.num_cells()); - for (size_t i = 0; i < fragment1.dim_.num_cells(); i++) { - fragment1.dim_[i] = fragment0.dim_[i] + 1; + fragment1.dimension().resize(fragment0.dimension().num_cells()); + for (size_t i = 0; i < fragment1.dimension().num_cells(); i++) { + fragment1.dimension()[i] = fragment0.dimension()[i] + 1; } // atts don't really matter auto& f0atts = std::get<0>(fragment0.atts_); - f0atts.resize(fragment0.dim_.num_cells()); + f0atts.resize(fragment0.dimension().num_cells()); std::iota(f0atts.begin(), f0atts.end(), 0); auto& f1atts = std::get<0>(fragment1.atts_); - f1atts.resize(fragment1.dim_.num_cells()); + f1atts.resize(fragment1.dimension().num_cells()); std::iota(f1atts.begin(), f1atts.end(), int(f0atts.num_cells())); FxRun1D instance; @@ -1614,10 +1589,10 @@ TEST_CASE_METHOD( for (size_t f = 0; f < num_fragments; f++) { templates::Fragment1D fragment; - fragment.dim_.resize(fragment_size); + fragment.dimension().resize(fragment_size); std::iota( - fragment.dim_.begin(), - fragment.dim_.end(), + fragment.dimension().begin(), + fragment.dimension().end(), instance.array.dimension_.domain.lower_bound + static_cast(f)); auto& atts = std::get<0>(fragment.atts_); @@ -1741,10 +1716,10 @@ TEST_CASE_METHOD( for (size_t f = 0; f < num_fragments; f++) { templates::Fragment1D fragment; - fragment.dim_.resize(fragment_size); + fragment.dimension().resize(fragment_size); std::iota( - fragment.dim_.begin(), - fragment.dim_.end(), + fragment.dimension().begin(), + 
fragment.dimension().end(), static_cast(f * (fragment_size - 1))); auto& atts = std::get<0>(fragment.atts_); @@ -1922,13 +1897,13 @@ TEST_CASE_METHOD( for (size_t f = 0; f < num_fragments; f++) { templates::Fragment2D fdata; - fdata.d1_.reserve(fragment_size); - fdata.d2_.reserve(fragment_size); + fdata.d1().reserve(fragment_size); + fdata.d2().reserve(fragment_size); std::get<0>(fdata.atts_).reserve(fragment_size); for (size_t i = 0; i < fragment_size; i++) { - fdata.d1_.push_back(row(f, i)); - fdata.d2_.push_back(col(f, i)); + fdata.d1().push_back(row(f, i)); + fdata.d2().push_back(col(f, i)); std::get<0>(fdata.atts_) .push_back(static_cast(f * fragment_size + i)); } @@ -2126,34 +2101,34 @@ TEST_CASE_METHOD( const int tcol = instance.d2.domain.lower_bound + static_cast(f * instance.d2.extent); for (int i = 0; i < instance.d1.extent * instance.d2.extent - 2; i++) { - fdata.d1_.push_back(trow + i / instance.d1.extent); - fdata.d2_.push_back(tcol + i % instance.d1.extent); + fdata.d1().push_back(trow + i / instance.d1.extent); + fdata.d2().push_back(tcol + i % instance.d1.extent); std::get<0>(fdata.atts_).push_back(att++); } // then some sparse coords in the next space tile, // fill the data tile (if the capacity is 4), we'll call it T - fdata.d1_.push_back(trow); - fdata.d2_.push_back(tcol + instance.d2.extent); + fdata.d1().push_back(trow); + fdata.d2().push_back(tcol + instance.d2.extent); std::get<0>(fdata.atts_).push_back(att++); - fdata.d1_.push_back(trow + instance.d1.extent - 1); - fdata.d2_.push_back(tcol + instance.d2.extent + 2); + fdata.d1().push_back(trow + instance.d1.extent - 1); + fdata.d2().push_back(tcol + instance.d2.extent + 2); std::get<0>(fdata.atts_).push_back(att++); // then begin a new data tile "Tnext" which straddles the bounds of that // space tile. this will have a low MBR. 
- fdata.d1_.push_back(trow + instance.d1.extent - 1); - fdata.d2_.push_back(tcol + instance.d2.extent + 3); + fdata.d1().push_back(trow + instance.d1.extent - 1); + fdata.d2().push_back(tcol + instance.d2.extent + 3); std::get<0>(fdata.atts_).push_back(att++); - fdata.d1_.push_back(trow); - fdata.d2_.push_back(tcol + 2 * instance.d2.extent); + fdata.d1().push_back(trow); + fdata.d2().push_back(tcol + 2 * instance.d2.extent); std::get<0>(fdata.atts_).push_back(att++); // then add a point P which is less than the lower bound of Tnext's MBR, // and also between the last two coordinates of T FxRun2D::FragmentType fpoint; - fpoint.d1_.push_back(trow + instance.d1.extent - 1); - fpoint.d2_.push_back(tcol + instance.d1.extent + 1); + fpoint.d1().push_back(trow + instance.d1.extent - 1); + fpoint.d2().push_back(tcol + instance.d1.extent + 1); std::get<0>(fpoint.atts_).push_back(att++); instance.fragments.push_back(fdata); @@ -2268,13 +2243,13 @@ TEST_CASE_METHOD( for (size_t f = 0; f < num_fragments; f++) { FxRunType::FragmentType fragment; - fragment.dim_.resize(fragment_size); + fragment.dimension().resize(fragment_size); std::iota( - fragment.dim_.begin(), - fragment.dim_.end(), + fragment.dimension().begin(), + fragment.dimension().end(), dimension.domain.lower_bound); - std::get<0>(fragment.atts_).resize(fragment.dim_.num_cells()); + std::get<0>(fragment.atts_).resize(fragment.dimension().num_cells()); std::iota( std::get<0>(fragment.atts_).begin(), std::get<0>(fragment.atts_).end(), @@ -3218,8 +3193,8 @@ TEST_CASE_METHOD( for (uint64_t t = 0; t < fragment_same_timestamp_runs.size(); t++) { for (uint64_t f = 0; f < fragment_same_timestamp_runs[t]; f++) { FxRun2D::FragmentType fragment; - fragment.d1_ = {1, 2 + static_cast(t)}; - fragment.d2_ = {1, 2 + static_cast(f)}; + fragment.d1() = {1, 2 + static_cast(t)}; + fragment.d2() = {1, 2 + static_cast(f)}; std::get<0>(fragment.atts_) = std::vector{ static_cast(instance.fragments.size()), 
static_cast(instance.fragments.size())}; @@ -3248,7 +3223,7 @@ TEST_CASE_METHOD( CApiArray array(context(), raw_array, TILEDB_WRITE); for (uint64_t f = 0; f < fragment_same_timestamp_runs[t]; f++, i++) { - write_fragment( + write_fragment( instance.fragments[i], &array); } } @@ -3333,64 +3308,15 @@ TEST_CASE_METHOD( */ template void CSparseGlobalOrderFx::create_array(const Instance& instance) { - const auto dimensions = instance.dimensions(); - const auto attributes = instance.attributes(); - - std::vector dimension_names; - std::vector dimension_types; - std::vector dimension_ranges; - std::vector dimension_extents; - auto add_dimension = [&]( - const templates::Dimension& dimension) { - using CoordType = templates::Dimension::value_type; - dimension_names.push_back("d" + std::to_string(dimension_names.size() + 1)); - dimension_types.push_back(static_cast(D)); - dimension_ranges.push_back( - const_cast(&dimension.domain.lower_bound)); - dimension_extents.push_back(const_cast(&dimension.extent)); - }; - std::apply( - [&](const templates::Dimension&... 
dimension) { - (add_dimension(dimension), ...); - }, - dimensions); - - std::vector attribute_names; - std::vector attribute_types; - std::vector attribute_cell_val_nums; - std::vector attribute_nullables; - std::vector> attribute_compressors; - auto add_attribute = [&](Datatype datatype, - uint32_t cell_val_num, - bool nullable) { - attribute_names.push_back("a" + std::to_string(attribute_names.size() + 1)); - attribute_types.push_back(static_cast(datatype)); - attribute_cell_val_nums.push_back(cell_val_num); - attribute_nullables.push_back(nullable); - attribute_compressors.push_back(std::make_pair(TILEDB_FILTER_NONE, -1)); - }; - for (const auto& [datatype, cell_val_num, nullable] : attributes) { - add_attribute(datatype, cell_val_num, nullable); - } - - tiledb::test::create_array( - context(), + templates::ddl::create_array( array_name_, - TILEDB_SPARSE, - dimension_names, - dimension_types, - dimension_ranges, - dimension_extents, - attribute_names, - attribute_types, - attribute_cell_val_nums, - attribute_compressors, + Context(context(), false), + instance.dimensions(), + instance.attributes(), instance.tile_order(), instance.cell_order(), instance.tile_capacity(), - instance.allow_duplicates(), - false, - {attribute_nullables}); + instance.allow_duplicates()); } /** @@ -3419,13 +3345,13 @@ DeleteArrayGuard CSparseGlobalOrderFx::run_create(Instance& instance) { // the tile extent is 2 // create_default_array_1d(instance.array); - create_array(instance); + create_array(instance); DeleteArrayGuard arrayguard(context(), array_name_.c_str()); // write all fragments for (auto& fragment : instance.fragments) { - write_fragment(fragment); + write_fragment(fragment); } return arrayguard; @@ -3435,7 +3361,7 @@ template void CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(instance.num_user_cells > 0); - std::decay_t expect; + std::decay_t expect; // for de-duplicating, track the fragment that each coordinate came from // we will use this to select the 
coordinate from the most recent fragment @@ -3665,19 +3591,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(num_cells == num_cells_bound); } - std::apply( - [&](auto&... field) { - std::apply( - [&](const auto&... field_cursor) { - std::apply( - [&](const auto&... field_size) { - (field.apply_cursor(field_cursor, field_size), ...); - }, - field_sizes); - }, - outcursor); - }, - std::tuple_cat(outdims, outatts)); + templates::query::apply_cursor(out, outcursor, field_sizes); const uint64_t cursor_cells = templates::query::num_cells(out, outcursor); @@ -4014,11 +3928,11 @@ void show(const FxRun2D& instance, std::ostream& os) { os << "\t\t{" << std::endl; os << "\t\t\t\"d1\": [" << std::endl; os << "\t\t\t\t"; - show(fragment.d1_, os); + show(fragment.d1(), os); os << std::endl; os << "\t\t\t\"d2\": [" << std::endl; os << "\t\t\t\t"; - show(fragment.d2_, os); + show(fragment.d2(), os); os << std::endl; os << "\t\t\t], " << std::endl; os << "\t\t\t\"atts\": [" << std::endl; diff --git a/test/support/rapidcheck/array_templates.h b/test/support/rapidcheck/array_templates.h index 41f5ac82495..f2c1dacc232 100644 --- a/test/support/rapidcheck/array_templates.h +++ b/test/support/rapidcheck/array_templates.h @@ -139,7 +139,19 @@ Gen make_coordinate(const templates::Domain& domain) { // whereas the domain upper bound is inclusive. // As a result some contortion is required to deal // with numeric_limits. 
- if (std::is_signed::value) { + if constexpr (std::is_same_v) { + // NB: poor performance with small domains for sure + return gen::suchThat( + gen::map( + gen::string(), + [](std::string s) { + StringDimensionCoordType v(s.begin(), s.end()); + return v; + }), + [domain](const StringDimensionCoordType& s) { + return domain.lower_bound <= s && s <= domain.upper_bound; + }); + } else if constexpr (std::is_signed::value) { if (int64_t(domain.upper_bound) < std::numeric_limits::max()) { return gen::cast(gen::inRange( int64_t(domain.lower_bound), int64_t(domain.upper_bound + 1))); @@ -185,7 +197,11 @@ Gen> make_fragment_1d( std::apply( [&](std::vector tup_d1, auto... tup_atts) { - coords.values_ = tup_d1; + if constexpr (std::is_same_v) { + coords = query_buffers(tup_d1); + } else { + coords.values_ = tup_d1; + } atts = std::apply( [&](std::vector... att) { return std::make_tuple(query_buffers(att)...); @@ -195,7 +211,7 @@ Gen> make_fragment_1d( stdx::transpose(cells)); return Fragment1D{ - .dim_ = coords, .atts_ = atts}; + std::make_tuple(coords), atts}; }); } @@ -233,13 +249,127 @@ Gen> make_fragment_2d( stdx::transpose(cells)); return Fragment2D{ - .d1_ = coords_d1, .d2_ = coords_d2, .atts_ = atts}; + std::make_tuple(coords_d1, coords_d2), atts}; + }); +} + +template < + DimensionType D1, + DimensionType D2, + DimensionType D3, + AttributeType... Att> +Gen> make_fragment_3d( + bool allow_duplicates, + std::optional> d1, + std::optional> d2, + std::optional> d3) { + auto coord_d1 = + (d1.has_value() ? make_coordinate(d1.value()) : gen::arbitrary()); + auto coord_d2 = + (d2.has_value() ? make_coordinate(d2.value()) : gen::arbitrary()); + auto coord_d3 = + (d3.has_value() ? 
make_coordinate(d3.value()) : gen::arbitrary()); + + using Cell = std::tuple; + + auto cell = + gen::tuple(coord_d1, coord_d2, coord_d3, gen::arbitrary()...); + + auto uniqueCoords = [](const Cell& cell) { + return std::make_tuple( + std::get<0>(cell), std::get<1>(cell), std::get<2>(cell)); + }; + + auto cells = gen::nonEmpty( + allow_duplicates ? gen::container>(cell) : + gen::uniqueBy>(cell, uniqueCoords)); + + return gen::map(cells, [](std::vector cells) { + std::vector coords_d1; + std::vector coords_d2; + std::vector coords_d3; + std::tuple...> atts; + + std::apply( + [&](std::vector tup_d1, + std::vector tup_d2, + std::vector tup_d3, + auto... tup_atts) { + coords_d1 = tup_d1; + coords_d2 = tup_d2; + coords_d3 = tup_d3; + atts = std::make_tuple(tup_atts...); + }, + stdx::transpose(cells)); + + return Fragment3D{ + std::make_tuple(coords_d1, coords_d2, coords_d3), atts}; }); } -template <> -void show>(const templates::Domain& domain, std::ostream& os) { - os << "[" << domain.lower_bound << ", " << domain.upper_bound << "]"; +void showValue(const templates::Domain& domain, std::ostream& os); +void showValue(const templates::Domain& domain, std::ostream& os); +void showValue(const templates::Domain& domain, std::ostream& os); + +namespace detail { + +/** + * Specialization of `rc::detail::ShowDefault` for `query_buffers` of + * fundamental cell type. + * + * Parameters `A` and `B` are SFINAE which in principle allow less verbose + * alternative paths to providing this custom functionality. + */ +template +struct ShowDefault, A, B> { + static void show(const query_buffers& value, std::ostream& os) { + ::rc::show(value.values_, os); + } +}; + +/** + * Specialization of `rc::detail::ShowDefault` for + * `query_buffers>`. + * + * Parameters `A` and `B` are SFINAE which in principle allow less verbose + * alternative paths to providing this custom functionality. 
+ */ +template +struct ShowDefault>, A, B> { + static void show( + const query_buffers>& value, std::ostream& os) { + std::vector values; + for (uint64_t c = 0; c < value.num_cells(); c++) { + values.push_back(std::string(value[c].begin(), value[c].end())); + } + ::rc::show(values, os); + } +}; + +} // namespace detail + +/** + * Generic logic to for showing a `templates::FragmentType`. + */ +template +void showFragment( + const templates::Fragment& value, + std::ostream& os) { + auto showField = [&](const query_buffers& field) { + os << "\t\t"; + show(field, os); + os << std::endl; + }; + os << "{" << std::endl << "\t\"dimensions\": [" << std::endl; + std::apply( + [&](const auto&... dimension) { (showField(dimension), ...); }, + value.dimensions()); + os << "\t]" << std::endl; + os << "\t\"attributes\": [" << std::endl; + std::apply( + [&](const auto&... attribute) { (showField(attribute), ...); }, + value.attributes()); + os << "\t]" << std::endl << "}" << std::endl; } } // namespace rc diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 7858077b2b7..6e6d4f26da0 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -36,12 +36,14 @@ #include "tiledb.h" #include "tiledb/common/unreachable.h" +#include "tiledb/sm/cpp_api/tiledb" #include "tiledb/sm/query/ast/query_ast.h" #include "tiledb/type/datatype_traits.h" #include "tiledb/type/range/range.h" #include #include +#include #include #include #include @@ -57,6 +59,9 @@ class Dimension; namespace tiledb::test::templates { +using StringDimensionCoordType = std::vector; +using StringDimensionCoordView = std::span; + /** * Adapts a `std::tuple` whose fields are all `GlobalCellCmp` * to itself be `GlobalCellCmp`. 
@@ -67,26 +72,42 @@ struct global_cell_cmp_std_tuple { : tup_(tup) { } + private: + template + static constexpr tiledb::common::UntypedDatumView static_coord_datum( + const T& field) { + static_assert( + stdx::is_fundamental || + std::is_same_v || + std::is_same_v); + if constexpr (stdx::is_fundamental) { + return UntypedDatumView(&field, sizeof(T)); + } else { + return UntypedDatumView(field.data(), field.size()); + } + } + + template + static tiledb::common::UntypedDatumView try_dimension_datum( + const StdTuple& tup, unsigned dim) { + if (dim == I) { + return static_coord_datum(std::get(tup)); + } else if constexpr (I + 1 < std::tuple_size_v) { + return try_dimension_datum(tup, dim); + } else { + // NB: probably not reachable in practice + throw std::logic_error("Out of bounds access to dimension tuple"); + } + } + + public: tiledb::common::UntypedDatumView dimension_datum( const tiledb::sm::Dimension&, unsigned dim_idx) const { - return std::apply( - [&](const auto&... field) { - size_t sizes[] = {sizeof(std::decay_t)...}; - const void* const ptrs[] = { - static_cast(std::addressof(field))...}; - return UntypedDatumView(ptrs[dim_idx], sizes[dim_idx]); - }, - tup_); + return try_dimension_datum<0>(tup_, dim_idx); } const void* coord(unsigned dim) const { - return std::apply( - [&](const auto&... field) { - const void* const ptrs[] = { - static_cast(std::addressof(field))...}; - return ptrs[dim]; - }, - tup_); + return try_dimension_datum<0>(tup_, dim).content(); } StdTuple tup_; @@ -106,11 +127,12 @@ struct query_buffers {}; * Constrains types which can be used as the physical type of a dimension. 
*/ template -concept DimensionType = requires(const D& coord) { - typename std::is_signed; - { coord < coord } -> std::same_as; - { D(int64_t(coord)) } -> std::same_as; -}; +concept DimensionType = + std::is_same_v or requires(const D& coord) { + typename std::is_signed; + { coord < coord } -> std::same_as; + { D(int64_t(coord)) } -> std::same_as; + }; /** * Constrains types which can be used as the physical type of an attribute. @@ -206,6 +228,20 @@ struct Dimension { value_type extent; }; +template <> +struct Dimension { + using value_type = StringDimensionCoordType; + + Dimension() { + } + + Dimension(const Domain& domain) + : domain(domain) { + } + + std::optional> domain; +}; + template struct static_attribute {}; @@ -436,6 +472,10 @@ struct query_buffers { : values_(cells) { } + query_buffers(std::initializer_list cells) + : values_(cells) { + } + bool operator==(const self_type&) const = default; uint64_t num_cells() const { @@ -481,6 +521,11 @@ struct query_buffers { return *this; } + self_type& operator=(const std::initializer_list& values) { + values_ = values; + return *this; + } + query_field_size_type make_field_size(uint64_t cell_limit) const { return sizeof(T) * std::min(cell_limit, values_.size()); } @@ -1125,42 +1170,139 @@ struct query_buffers>> { } }; -/** - * Data for a one-dimensional array - */ -template -struct Fragment1D { - using DimensionType = D; +template +struct Fragment { + private: + template + struct to_query_buffers { + using value_type = std::tuple...>; + using ref_type = std::tuple&...>; + using const_ref_type = std::tuple&...>; + }; + + template + static to_query_buffers::value_type f_qb_value(std::tuple) { + return std::declval::value_type>(); + } + + template + static to_query_buffers::ref_type f_qb_ref(std::tuple) { + return std::declval::ref_type>(); + } + + template + static to_query_buffers::const_ref_type f_qb_const_ref( + std::tuple) { + return std::declval::const_ref_type>(); + } + + template + using 
value_tuple_query_buffers = decltype(f_qb_value(std::declval())); + + template + using ref_tuple_query_buffers = decltype(f_qb_ref(std::declval())); + + template + using const_ref_tuple_query_buffers = + decltype(f_qb_const_ref(std::declval())); + + public: + using DimensionTuple = _DimensionTuple; + using AttributeTuple = _AttributeTuple; + + using self_type = Fragment; + + using DimensionBuffers = value_tuple_query_buffers; + using DimensionBuffersRef = ref_tuple_query_buffers; + using DimensionBuffersConstRef = + const_ref_tuple_query_buffers; + + using AttributeBuffers = value_tuple_query_buffers; + using AttributeBuffersRef = ref_tuple_query_buffers; + using AttributeBuffersConstRef = + const_ref_tuple_query_buffers; - query_buffers dim_; - std::tuple...> atts_; + DimensionBuffers dims_; + AttributeBuffers atts_; + + uint64_t num_cells() const { + static_assert( + std::tuple_size::value > 0 || + std::tuple_size::value > 0); + + if constexpr (std::tuple_size::value == 0) { + return std::get<0>(atts_).num_cells(); + } else { + return std::get<0>(dims_).num_cells(); + } + } uint64_t size() const { - return dim_.num_cells(); + return num_cells(); + } + + const DimensionBuffersConstRef dimensions() const { + return std::apply( + [](const auto&... field) { return std::forward_as_tuple(field...); }, + dims_); } - std::tuple&> dimensions() const { - return std::tuple&>(dim_); + DimensionBuffersRef dimensions() { + return std::apply( + [](auto&... field) { return std::forward_as_tuple(field...); }, dims_); } - std::tuple&...> attributes() const { + const AttributeBuffersConstRef attributes() const { return std::apply( - [](const query_buffers&... attribute) { - return std::tuple&...>(attribute...); - }, + [](const auto&... field) { return std::forward_as_tuple(field...); }, atts_); } - std::tuple&> dimensions() { - return std::tuple&>(dim_); + AttributeBuffersRef attributes() { + return std::apply( + [](auto&... 
field) { return std::forward_as_tuple(field...); }, atts_); } - std::tuple&...> attributes() { - return std::apply( - [](query_buffers&... attribute) { - return std::tuple&...>(attribute...); + void reserve(uint64_t num_cells) { + std::apply( + [num_cells](Ts&... field) { + (field.reserve(num_cells), ...); }, - atts_); + std::tuple_cat(dimensions(), attributes())); + } + + void resize(uint64_t num_cells) { + std::apply( + [num_cells](Ts&... field) { + (field.resize(num_cells), ...); + }, + std::tuple_cat(dimensions(), attributes())); + } + + void extend(const self_type& other) { + std::apply( + [&](Ts&... dst) { + std::apply( + [&](const Us&... src) { (dst.extend(src), ...); }, + std::tuple_cat(other.dimensions(), other.attributes())); + }, + std::tuple_cat(dimensions(), attributes())); + } +}; + +/** + * Data for a one-dimensional array + */ +template +struct Fragment1D : public Fragment, std::tuple> { + using DimensionType = D; + + const query_buffers& dimension() const { + return std::get<0>(this->dimensions()); + } + + query_buffers& dimension() { + return std::get<0>(this->dimensions()); } }; @@ -1168,39 +1310,52 @@ struct Fragment1D { * Data for a two-dimensional array */ template -struct Fragment2D { - query_buffers d1_; - query_buffers d2_; - std::tuple...> atts_; +struct Fragment2D : public Fragment, std::tuple> { + const query_buffers& d1() const { + return std::get<0>(this->dimensions()); + } - uint64_t size() const { - return d1_.num_cells(); + const query_buffers& d2() const { + return std::get<1>(this->dimensions()); } - std::tuple&, const query_buffers&> dimensions() - const { - return std::tuple&, const query_buffers&>( - d1_, d2_); + query_buffers& d1() { + return std::get<0>(this->dimensions()); } - std::tuple&, query_buffers&> dimensions() { - return std::tuple&, query_buffers&>(d1_, d2_); + query_buffers& d2() { + return std::get<1>(this->dimensions()); } +}; - std::tuple&...> attributes() const { - return std::apply( - [](const 
query_buffers&... attribute) { - return std::tuple&...>(attribute...); - }, - atts_); +/** + * Data for a three-dimensional array + */ +template +struct Fragment3D + : public Fragment, std::tuple> { + const query_buffers& d1() const { + return std::get<0>(this->dimensions()); } - std::tuple&...> attributes() { - return std::apply( - [](query_buffers&... attribute) { - return std::tuple&...>(attribute...); - }, - atts_); + const query_buffers& d2() const { + return std::get<1>(this->dimensions()); + } + + const query_buffers& d3() const { + return std::get<2>(this->dimensions()); + } + + query_buffers& d1() { + return std::get<0>(this->dimensions()); + } + + query_buffers& d2() { + return std::get<1>(this->dimensions()); + } + + query_buffers& d3() { + return std::get<2>(this->dimensions()); } }; @@ -1317,10 +1472,12 @@ namespace query { template auto make_field_sizes( F& fragment, uint64_t cell_limit = std::numeric_limits::max()) { + typename F::DimensionBuffersRef dims = fragment.dimensions(); + typename F::AttributeBuffersRef atts = fragment.attributes(); return [cell_limit](std::tuple fields) { return query_applicator::make_field_sizes( fields, cell_limit); - }(std::tuple_cat(fragment.dimensions(), fragment.attributes())); + }(std::tuple_cat(dims, atts)); } template @@ -1328,6 +1485,31 @@ using fragment_field_sizes_t = decltype(make_field_sizes( std::declval(), std::declval())); +/** + * Apply field cursor and sizes to each field of `fragment`. + */ +template +void apply_cursor( + F& fragment, + const fragment_field_sizes_t& cursor, + const fragment_field_sizes_t& field_sizes) { + typename F::DimensionBuffersRef dims = fragment.dimensions(); + typename F::AttributeBuffersRef atts = fragment.attributes(); + std::apply( + [&](auto&... field) { + std::apply( + [&](const auto&... field_cursor) { + std::apply( + [&](const auto&... 
field_size) { + (field.apply_cursor(field_cursor, field_size), ...); + }, + field_sizes); + }, + cursor); + }, + std::tuple_cat(dims, atts)); +} + /** * Set buffers on `query` for the tuple of field columns */ @@ -1349,24 +1531,31 @@ void set_fields( std::decay_t, std::tuple_size_v>::value(field_cursors); - [&](std::tuple fields) { - query_applicator::set( - ctx, - query, - split_sizes.first, - fields, - dimension_name, - split_cursors.first); - }(fragment.dimensions()); - [&](std::tuple fields) { - query_applicator::set( - ctx, - query, - split_sizes.second, - fields, - attribute_name, - split_cursors.second); - }(fragment.attributes()); + if constexpr (!std:: + is_same_v>) { + [&](std::tuple fields) { + query_applicator::set( + ctx, + query, + split_sizes.first, + fields, + dimension_name, + split_cursors.first); + }(fragment.dimensions()); + } + + if constexpr (!std:: + is_same_v>) { + [&](std::tuple fields) { + query_applicator::set( + ctx, + query, + split_sizes.second, + fields, + attribute_name, + split_cursors.second); + }(fragment.attributes()); + } } /** @@ -1379,8 +1568,126 @@ uint64_t num_cells(const F& fragment, const auto& field_sizes) { }(std::tuple_cat(fragment.dimensions(), fragment.attributes())); } +/** + * Writes a fragment to an array. 
+ */ +template +void write_fragment( + const Fragment& fragment, + Array& forwrite, + tiledb_layout_t layout = TILEDB_UNORDERED) { + Query query(forwrite); + query.set_layout(layout); + + auto field_sizes = + make_field_sizes(const_cast(fragment)); + templates::query::set_fields( + query.ctx().ptr().get(), + query.ptr().get(), + field_sizes, + const_cast(fragment), + [](unsigned d) { return "d" + std::to_string(d + 1); }, + [](unsigned a) { return "a" + std::to_string(a + 1); }); + + const auto status = query.submit(); + ASSERTER(status == Query::Status::COMPLETE); + + if (layout == TILEDB_GLOBAL_ORDER) { + query.finalize(); + } + + // check that sizes match what we expect + const uint64_t expect_num_cells = fragment.size(); + const uint64_t num_cells = + templates::query::num_cells(fragment, field_sizes); + + ASSERTER(num_cells == expect_num_cells); +} + } // namespace query +namespace ddl { + +/** + * Creates an array with a schema whose dimensions and attributes + * come from the simplified arguments. + * The names of the dimensions are d1, d2, etc. + * The names of the attributes are a1, a2, etc. 
+ */ +template +void create_array( + const std::string& array_name, + const Context& context, + const std::tuple&...> dimensions, + std::vector> attributes, + tiledb_layout_t tile_order, + tiledb_layout_t cell_order, + uint64_t tile_capacity, + bool allow_duplicates) { + std::vector dimension_names; + std::vector dimension_types; + std::vector dimension_ranges; + std::vector dimension_extents; + auto add_dimension = [&]( + const templates::Dimension& dimension) { + using CoordType = templates::Dimension::value_type; + dimension_names.push_back("d" + std::to_string(dimension_names.size() + 1)); + dimension_types.push_back(static_cast(D)); + if constexpr (std::is_same_v) { + dimension_ranges.push_back(nullptr); + dimension_extents.push_back(nullptr); + } else { + dimension_ranges.push_back( + const_cast(&dimension.domain.lower_bound)); + dimension_extents.push_back(const_cast(&dimension.extent)); + } + }; + std::apply( + [&](const templates::Dimension&... dimension) { + (add_dimension(dimension), ...); + }, + dimensions); + + std::vector attribute_names; + std::vector attribute_types; + std::vector attribute_cell_val_nums; + std::vector attribute_nullables; + std::vector> attribute_compressors; + auto add_attribute = [&](Datatype datatype, + uint32_t cell_val_num, + bool nullable) { + attribute_names.push_back("a" + std::to_string(attribute_names.size() + 1)); + attribute_types.push_back(static_cast(datatype)); + attribute_cell_val_nums.push_back(cell_val_num); + attribute_nullables.push_back(nullable); + attribute_compressors.push_back(std::make_pair(TILEDB_FILTER_NONE, -1)); + }; + for (const auto& [datatype, cell_val_num, nullable] : attributes) { + add_attribute(datatype, cell_val_num, nullable); + } + + tiledb::test::create_array( + context.ptr().get(), + array_name, + TILEDB_SPARSE, + dimension_names, + dimension_types, + dimension_ranges, + dimension_extents, + attribute_names, + attribute_types, + attribute_cell_val_nums, + attribute_compressors, + 
tile_order, + cell_order, + tile_capacity, + allow_duplicates, + false, + {attribute_nullables}); +} + +} // namespace ddl + } // namespace tiledb::test::templates #endif diff --git a/tiledb/common/types/untyped_datum.h b/tiledb/common/types/untyped_datum.h index 1f61e681ea7..51ee98a0f7c 100644 --- a/tiledb/common/types/untyped_datum.h +++ b/tiledb/common/types/untyped_datum.h @@ -41,23 +41,23 @@ class UntypedDatumView { size_t datum_size_; public: - UntypedDatumView(const void* content, size_t size) + constexpr UntypedDatumView(const void* content, size_t size) : datum_content_(content) , datum_size_(size) { } - UntypedDatumView(std::string_view ss) + constexpr UntypedDatumView(std::string_view ss) : datum_content_(ss.data()) , datum_size_(ss.size()) { } - [[nodiscard]] inline const void* content() const { + [[nodiscard]] constexpr inline const void* content() const { return datum_content_; } - [[nodiscard]] inline size_t size() const { + [[nodiscard]] constexpr inline size_t size() const { return datum_size_; } template - [[nodiscard]] inline const T& value_as() const { + [[nodiscard]] constexpr inline const T& value_as() const { return *static_cast(datum_content_); } }; diff --git a/tiledb/sm/cpp_api/array.h b/tiledb/sm/cpp_api/array.h index 5c4bca8c272..03c50aefabd 100644 --- a/tiledb/sm/cpp_api/array.h +++ b/tiledb/sm/cpp_api/array.h @@ -319,6 +319,11 @@ class Array { return std::string(uri); } + /** Get the Context for the array. */ + const Context& context() const { + return ctx_.get(); + } + /** Get the ArraySchema for the array. **/ ArraySchema schema() const { auto& ctx = ctx_.get(); diff --git a/tiledb/sm/cpp_api/query.h b/tiledb/sm/cpp_api/query.h index 2e5308180d2..cbb0eafd361 100644 --- a/tiledb/sm/cpp_api/query.h +++ b/tiledb/sm/cpp_api/query.h @@ -171,6 +171,34 @@ class Query { : Query(ctx, array, array.query_type()) { } + /** + * Creates a TileDB query object. 
+ * + * The context and query type (read or write) are inferred from the array + * object, which was opened with a specific query type. + * + * The storage manager also acquires a **shared lock** on the array. This + * means multiple read and write queries to the same array can be made + * concurrently (in TileDB, only consolidation requires an exclusive lock for + * a short period of time). + * + * **Example:** + * + * @code{.cpp} + * // Open the array for writing + * tiledb::Context ctx; + * tiledb::Array array(ctx, "my_array", TILEDB_WRITE); + * Query query(array); + * // Equivalent to: + * // Query query(ctx, array, TILEDB_WRITE); + * @endcode + * + * @param array Open Array object + */ + Query(const Array& array) + : Query(array.context(), array) { + } + Query(const Query&) = default; Query(Query&&) = default; Query& operator=(const Query&) = default; From abef3a46688e3ca68a195aaccc3980acfd599810 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 5 Nov 2025 10:03:38 -0500 Subject: [PATCH 3/3] bugfix: Fix dense fragment domains during global order write with maximum fragment size (#5655) In CORE-290 a customer reported issues with corrupt arrays after running consolidation. The symptom was memory allocation errors when opening an array. The root cause turned out to be the consolidation itself was writing new fragments where the fragment domain did not match the number of tiles in the fragment. The fragment metadata domain is a bounding rectangle. This means that the global order writer must split the tiles of its input into fragments at tile boundaries which bisect the bounding rectangle into two smaller bounding rectangles. To do so, we add a first pass identify_fragment_tile_boundaries which returns a list of tile offsets where new fragments will begin. Upon finishing a fragment, we use that tile offset to determine which rectangle within the target subarray the fragment actually represents, and update the fragment metadata accordingly. 
We use new functions is_rectangular_domain to determine whether a (start_tile, num_tiles) pair identifies a rectangle, and domain_tile_offset to compute that rectangle. Much of the complexity comes from the usage of the global order writer which does happen in consolidation: multi-part writes. A user (or a consolidation operation) can set a domain D which it intends to write into, and then actually fill in all of the cells over multiple submit calls which stream in the cells to write. It is not required for these cells to be tile aligned. Because of that, and the need to write rectangle fragments, a single submit cannot always determine whether a tail of tiles belongs to its current fragment or must be deferred to the next. To get around this we keep those tiles in memory in the global_write_state_ and prepend them to the user input in the next submit. --- test/src/unit-cppapi-consolidation.cc | 425 ++++++ test/src/unit-cppapi-max-fragment-size.cc | 1160 +++++++++++++++++ test/src/unit-sparse-global-order-reader.cc | 24 +- test/support/CMakeLists.txt | 3 +- .../rapidcheck/array_schema_templates.h | 198 +++ test/support/rapidcheck/array_templates.h | 138 +- .../rapidcheck/show/array_schema_templates.cc | 74 ++ .../rapidcheck/{show.cc => show/query_ast.cc} | 5 +- test/support/src/array_schema_templates.h | 219 ++++ test/support/src/array_templates.h | 514 +++++--- test/support/src/fragment_info_helpers.h | 166 +++ tiledb/common/arithmetic.h | 14 + tiledb/sm/fragment/fragment_metadata.cc | 59 +- tiledb/sm/fragment/fragment_metadata.h | 11 +- tiledb/sm/query/query.cc | 30 +- tiledb/sm/query/query.h | 4 +- .../sm/query/writers/global_order_writer.cc | 579 ++++++-- tiledb/sm/query/writers/global_order_writer.h | 114 +- tiledb/sm/query/writers/unordered_writer.cc | 2 +- tiledb/sm/query/writers/writer_base.cc | 52 +- tiledb/sm/query/writers/writer_base.h | 56 +- tiledb/sm/tile/arithmetic.h | 228 ++++ tiledb/sm/tile/test/CMakeLists.txt | 3 + tiledb/sm/tile/test/arithmetic.h 
| 95 ++ tiledb/sm/tile/test/unit_arithmetic.cc | 756 +++++++++++ tiledb/sm/tile/tile.h | 7 + tiledb/sm/tile/writer_tile_tuple.cc | 15 + tiledb/sm/tile/writer_tile_tuple.h | 6 + tiledb/type/range/range.h | 13 + 29 files changed, 4435 insertions(+), 535 deletions(-) create mode 100644 test/support/rapidcheck/array_schema_templates.h create mode 100644 test/support/rapidcheck/show/array_schema_templates.cc rename test/support/rapidcheck/{show.cc => show/query_ast.cc} (93%) create mode 100644 test/support/src/array_schema_templates.h create mode 100644 test/support/src/fragment_info_helpers.h create mode 100644 tiledb/sm/tile/arithmetic.h create mode 100644 tiledb/sm/tile/test/arithmetic.h create mode 100644 tiledb/sm/tile/test/unit_arithmetic.cc diff --git a/test/src/unit-cppapi-consolidation.cc b/test/src/unit-cppapi-consolidation.cc index 950553890c1..31a3db46dc4 100644 --- a/test/src/unit-cppapi-consolidation.cc +++ b/test/src/unit-cppapi-consolidation.cc @@ -32,10 +32,16 @@ #include "tiledb/sm/cpp_api/tiledb_experimental" #include +#include "test/support/src/array_helpers.h" +#include "test/support/src/array_templates.h" +#include "test/support/src/fragment_info_helpers.h" #include "test/support/src/helpers.h" +#include "tiledb/api/c_api/array/array_api_internal.h" #include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/misc/comparators.h" using namespace tiledb; +using namespace tiledb::test; void remove_array(const std::string& array_name) { Context ctx; @@ -538,3 +544,422 @@ TEST_CASE( remove_array(array_name); } + +template +void instance_dense_consolidation_create_array( + Context& ctx, + const std::string& array_name, + const std::vector>& domain) { + using Coord = templates::Dimension
::value_type; + + // create array + Domain arraydomain(ctx); + for (uint64_t d = 0; d < domain.size(); d++) { + const std::string dname = "d" + std::to_string(d + 1); + auto dd = Dimension::create( + ctx, + dname, + {domain[d].domain.lower_bound, domain[d].domain.upper_bound}, + domain[d].extent); + arraydomain.add_dimension(dd); + } + + ArraySchema schema(ctx, TILEDB_DENSE); + schema.set_domain(arraydomain); + + const std::vector> attributes = + templates::ddl::physical_type_attributes(); + for (uint64_t a = 0; a < attributes.size(); a++) { + const std::string aname = "a" + std::to_string(a + 1); + auto aa = Attribute::create( + ctx, + aname, + static_cast(std::get<0>(attributes[a]))) + .set_cell_val_num(std::get<1>(attributes[a])) + .set_nullable(std::get<2>(attributes[a])); + schema.add_attribute(aa); + } + + Array::create(array_name, schema); +} + +/** + * Runs an instance of a dense consolidation test. + * The `fragments` are written in ascending order from the beginning of the + * array domain. + * + * Asserts that after consolidation we get fragments which appropriately satisfy + * `max_fragment_size`: + * 1) no fragment is larger than that size + * 2) if the union of two adjacent fragments can form a rectangular domain, then + * the sum of their sizes must exceed the maximum fragment size (else they + * should be one fragment) + * + * @precondition the `fragments` each have a number of cells which is an + * integral number of tiles + */ +template < + sm::Datatype DT, + templates::FragmentType F, + typename Asserter = AsserterCatch> +std::vector::domain_type>> +instance_dense_consolidation( + Context& ctx, + const std::string& array_name, + const std::vector>& domain, + std::vector& fragments, + uint64_t max_fragment_size) { + using Coord = templates::Dimension
::value_type; + + static constexpr sm::Layout tile_order = sm::Layout::ROW_MAJOR; + + // create array + instance_dense_consolidation_create_array(ctx, array_name, domain); + + DeleteArrayGuard arrayguard(ctx.ptr().get(), array_name.c_str()); + + sm::NDRange array_domain; + for (const auto& dim : domain) { + array_domain.push_back( + Range(dim.domain.lower_bound, dim.domain.upper_bound)); + } + + uint64_t num_cells_per_tile = 1; + std::vector tile_extents; + for (const auto& dim : domain) { + tile_extents.push_back(dim.extent); + num_cells_per_tile *= static_cast(dim.extent); + } + + // populate array + uint64_t start_tile = 0; + { + Array forwrite(ctx, array_name, TILEDB_WRITE); + for (auto& f : fragments) { + const uint64_t f_num_tiles = f.num_cells() / num_cells_per_tile; + + const std::optional subarray = domain_tile_offset( + tile_order, tile_extents, array_domain, start_tile, f_num_tiles); + ASSERTER(subarray.has_value()); + + templates::query::write_fragment( + f, forwrite, subarray.value()); + + start_tile += f_num_tiles; + } + } + + sm::NDRange non_empty_domain; + { + std::optional maybe = domain_tile_offset( + tile_order, tile_extents, array_domain, 0, start_tile); + ASSERTER(maybe.has_value()); + non_empty_domain = maybe.value(); + } + + // consolidate + Config cconfig; + cconfig["sm.consolidation.max_fragment_size"] = + std::to_string(max_fragment_size); + Array::consolidate(ctx, array_name, &cconfig); + + Array forread(ctx, array_name, TILEDB_READ); + + // sanity check the non-empty domain + // NB: cannot use `==` for some reason, the array `non_empty_domain` method + // returns `range_start_size_` zero + { + const auto actual_domain = forread.ptr()->array()->non_empty_domain(); + for (uint64_t d = 0; d < domain.size(); d++) { + ASSERTER( + non_empty_domain[d].start_as() == + actual_domain[d].start_as()); + ASSERTER( + non_empty_domain[d].end_as() == + actual_domain[d].end_as()); + } + } + + // check fragment info + FragmentInfo finfo(ctx, array_name); 
+ finfo.load(); + + const auto fragment_domains = + collect_and_validate_fragment_domains( + ctx, + tile_order, + array_name, + tile_extents, + non_empty_domain, + max_fragment_size); + + // read back fragments to check contents + std::vector api_subarray; + api_subarray.reserve(2 * domain.size()); + for (uint64_t d = 0; d < domain.size(); d++) { + api_subarray.push_back(non_empty_domain[d].start_as()); + api_subarray.push_back(non_empty_domain[d].end_as()); + } + + F input_concatenated, output; + for (const auto& f : fragments) { + input_concatenated.extend(f); + } + + // sort in global order + { + std::vector idxs(input_concatenated.size()); + std::iota(idxs.begin(), idxs.end(), 0); + + std::vector next_coord; + next_coord.reserve(domain.size()); + for (uint64_t d = 0; d < domain.size(); d++) { + next_coord.push_back(domain[d].domain.lower_bound); + } + + // enumerate one coordinate per input cell, advancing the last dimension + // fastest (odometer style) + std::vector> coords; + coords.reserve(input_concatenated.size()); + for (uint64_t i = 0; i < input_concatenated.size(); i++) { + coords.push_back(next_coord); + for (uint64_t di = 0; di < domain.size(); di++) { + const uint64_t d = domain.size() - di - 1; + if (next_coord[d] < domain[d].domain.upper_bound) { + ++next_coord[d]; + break; + } else { + // wrap back to this dimension's lower bound (not a literal zero, + // the domain need not start at 0) and carry into the next one + next_coord[d] = domain[d].domain.lower_bound; + } + } + } + + sm::GlobalCellCmp globalcmp( + forread.ptr()->array()->array_schema_latest().domain()); + + auto icmp = [&](uint64_t ia, uint64_t ib) -> bool { + const auto sa = templates::global_cell_cmp_span(coords[ia]); + const auto sb = templates::global_cell_cmp_span(coords[ib]); + return globalcmp(sa, sb); + }; + + std::sort(idxs.begin(), idxs.end(), icmp); + + input_concatenated.attributes() = stdx::select( + stdx::reference_tuple(input_concatenated.attributes()), + std::span(idxs)); + } + + output = input_concatenated; + + Subarray sub(ctx, forread); + sub.set_subarray(api_subarray); + + Query query(forread.context(), forread); + query.set_layout(TILEDB_GLOBAL_ORDER); + query.set_subarray(sub); + + // make field size locations + 
templates::query::fragment_field_sizes_t field_sizes = + templates::query::make_field_sizes(output, output.num_cells()); + + // add fields to query + auto outcursor = templates::query::fragment_field_sizes_t(); + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + output, + [](unsigned d) { return "d" + std::to_string(d + 1); }, + [](unsigned a) { return "a" + std::to_string(a + 1); }, + outcursor); + + const auto status = query.submit(); + ASSERTER(status == Query::Status::COMPLETE); + + // resize according to what was found + templates::query::apply_cursor(output, outcursor, field_sizes); + + ASSERTER(output == input_concatenated); + + return fragment_domains; +} + +/** + * Test case inspired by CORE-290. + * + */ +TEST_CASE( + "C++ API: Test consolidation dense array with max fragment size", + "[cppapi][consolidation][rest]") { + using Dim64 = templates::Dimension; + using Dom64 = Dim64::domain_type; + using DenseFragmentFixed = templates::Fragment, std::tuple>; + + const std::string array_name = "cppapi_consolidation_dense"; + + Context ctx; + + SECTION("2D") { + SECTION("Row tiles") { + const Dim64 row(0, std::numeric_limits::max() - 1, 1); + const Dim64 col(0, 99999, 100000); + + const uint64_t num_fragments = 32; + + // each input fragment is a single row + std::vector input_fragments; + for (uint64_t f = 0; f < num_fragments; f++) { + DenseFragmentFixed fdata; + fdata.resize(row.extent * col.domain.num_cells()); + + auto& att = std::get<0>(fdata.attributes()); + std::iota( + att.begin(), att.end(), static_cast(f) * fdata.num_cells()); + + input_fragments.push_back(fdata); + } + + // unfiltered, each row takes `100000 * sizeof(int)` bytes, plus some + // padding + const uint64_t tile_size = (row.extent * col.extent * sizeof(int)) + 92; + const uint64_t max_fragment_size = GENERATE_COPY( + tile_size - 1, tile_size, (2 * tile_size) - 1, 2 * tile_size); + + const uint64_t rows_per_fragment = max_fragment_size / tile_size; + 
DYNAMIC_SECTION( + "max_fragment_size = " + std::to_string(max_fragment_size)) { + if (rows_per_fragment == 0) { + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + REQUIRE_THROWS( + instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, + array_name, + {row, col}, + input_fragments, + max_fragment_size), + expect); + } else { + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, max_fragment_size); + + std::vector> expect; + for (uint64_t r = 0; r < num_fragments; r += rows_per_fragment) { + expect.push_back({Dom64(r, r + rows_per_fragment - 1), col.domain}); + } + CHECK(output_fragments == expect); + } + } + } + + SECTION("Rectangle tiles") { + const Dim64 row(0, std::numeric_limits::max() - 1, 4); + const Dim64 col(0, 99999, 100000 / row.extent); + + const uint64_t num_fragments = 32; + + // each input fragment is 4 tiles, covering 4 rows of cells + std::vector input_fragments; + for (uint64_t f = 0; f < num_fragments; f++) { + DenseFragmentFixed fdata; + fdata.resize(row.extent * col.extent * row.extent); + + auto& att = std::get<0>(fdata.attributes()); + std::iota( + att.begin(), att.end(), static_cast(f) * fdata.num_cells()); + + input_fragments.push_back(fdata); + } + + // unfiltered, each row takes `100000 * sizeof(int)` bytes, plus some + // padding + const uint64_t tile_size = (row.extent * col.extent * sizeof(int)) + 92; + + SECTION("Too small") { + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + REQUIRE_THROWS( + instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, tile_size - 1), + expect); + } + SECTION("One tile") { + std::vector> expect; + for (uint64_t r 
= 0; r < num_fragments; r++) { + for (uint64_t c = 0; c < 4; c++) { + expect.push_back( + {Dom64(r * 4, r * 4 + 3), + Dom64(col.extent * c, (col.extent * (c + 1)) - 1)}); + } + } + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, tile_size); + CHECK(output_fragments == expect); + } + SECTION("Two tiles") { + // two of the four tiles in each row of tiles fit in one fragment, + // so each row splits into a left half and a right half + std::vector> expect; + for (uint64_t r = 0; r < num_fragments; r++) { + expect.push_back( + {Dom64(r * 4, r * 4 + 3), Dom64(0, (col.extent * 2) - 1)}); + expect.push_back( + {Dom64(r * 4, r * 4 + 3), + Dom64(col.extent * 2, (col.extent * 4) - 1)}); + } + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, 2 * tile_size); + CHECK(output_fragments == expect); + } + SECTION("Three tiles") { + // now we have some trouble, each row is 4 tiles, 3 of them fit, + // so we will alternate fragments with 3 tiles and fragments with 1 + // tile to fill out the row, yikes + std::vector> expect; + for (uint64_t r = 0; r < num_fragments; r++) { + expect.push_back( + {Dom64(r * 4, r * 4 + 3), Dom64(0, (col.extent * 3) - 1)}); + expect.push_back( + {Dom64(r * 4, r * 4 + 3), + Dom64(col.extent * 3, (col.extent * 4) - 1)}); + } + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, 3 * tile_size); + CHECK(output_fragments == expect); + } + SECTION("Four tiles") { + std::vector> expect; + for (uint64_t f = 0; f < num_fragments; f++) { + expect.push_back({Dom64(f * 4, f * 4 + 3), col.domain}); + } + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, 4 * tile_size); + CHECK(output_fragments == expect); + } + SECTION("Five tiles") { + // since we need rectangle domains this is the same as four tiles + std::vector> 
expect; + for (uint64_t f = 0; f < num_fragments; f++) { + expect.push_back({Dom64(f * 4, f * 4 + 3), col.domain}); + } + const auto output_fragments = instance_dense_consolidation< + sm::Datatype::UINT64, + DenseFragmentFixed>( + ctx, array_name, {row, col}, input_fragments, 5 * tile_size); + CHECK(output_fragments == expect); + } + } + } +} diff --git a/test/src/unit-cppapi-max-fragment-size.cc b/test/src/unit-cppapi-max-fragment-size.cc index dd79e638fe4..a5d5f883153 100644 --- a/test/src/unit-cppapi-max-fragment-size.cc +++ b/test/src/unit-cppapi-max-fragment-size.cc @@ -30,17 +30,34 @@ * Tests the C++ API for maximum fragment size. */ +#include #include +#include +#include "test/support/rapidcheck/array_templates.h" +#include "test/support/src/array_helpers.h" +#include "test/support/src/array_templates.h" +#include "test/support/src/fragment_info_helpers.h" #include "test/support/src/helpers.h" +#include "test/support/src/vfs_helpers.h" +#include "tiledb/api/c_api/array_schema/array_schema_api_internal.h" +#include "tiledb/api/c_api/fragment_info/fragment_info_api_internal.h" +#include "tiledb/api/c_api/subarray/subarray_api_internal.h" +#include "tiledb/common/arithmetic.h" #include "tiledb/common/scoped_executor.h" #include "tiledb/common/stdx_string.h" #include "tiledb/sm/c_api/tiledb_struct_def.h" #include "tiledb/sm/cpp_api/tiledb" #include "tiledb/sm/misc/constants.h" +#include "tiledb/sm/query/writers/global_order_writer.h" +#include "tiledb/sm/tile/arithmetic.h" +#include "tiledb/sm/tile/test/arithmetic.h" +#include "tiledb/sm/tile/tile.h" #include +#include using namespace tiledb; +using namespace tiledb::test; struct CPPMaxFragmentSizeFx { const int max_domain = 1000000; @@ -503,3 +520,1146 @@ TEST_CASE( array.close(); } + +/** + * @return the number of cells contained within a subarray, or `std::nullopt` if + * overflow + */ +std::optional subarray_num_cells( + std::span> subarray) { + uint64_t num_cells = 1; + for (const auto& dim : subarray) { + 
auto maybe = checked_arithmetic::mul(num_cells, dim.num_cells()); + if (!maybe.has_value()) { + return std::nullopt; + } + num_cells = maybe.value(); + } + return num_cells; +} + +/** + * Creates an array with the provided `dimensions` and then + * runs a global order write into `subarray` using `max_fragment_size` to bound + * the fragment size. + * + * Asserts that all created fragments respect `max_fragment_size` and that the + * data read back out for `subarray` matches what we wrote into it. + * + * @return a list of the domains written to each fragment in ascending order + */ +template +std::vector>> +instance_dense_global_order( + const Context& ctx, + const std::string& array_name, + tiledb_layout_t tile_order, + tiledb_layout_t cell_order, + uint64_t max_fragment_size, + const std::vector>& dimensions, + const std::vector>& subarray, + const F& attributes, + std::optional write_unit_num_cells = std::nullopt) { + Domain domain(ctx); + for (uint64_t d = 0; d < dimensions.size(); d++) { + const std::string dname = "d" + std::to_string(d); + auto dim = Dimension::create( + ctx, + dname, + {{dimensions[d].domain.lower_bound, dimensions[d].domain.upper_bound}}, + dimensions[d].extent); + domain.add_dimension(dim); + } + + ArraySchema schema(ctx, TILEDB_DENSE); + schema.set_domain(domain); + schema.set_tile_order(tile_order); + schema.set_cell_order(cell_order); + + const std::vector> ddl_attributes = + templates::ddl::physical_type_attributes(); + for (uint64_t a = 0; a < ddl_attributes.size(); a++) { + const std::string aname = "a" + std::to_string(a + 1); + auto aa = + Attribute::create( + ctx, + aname, + static_cast(std::get<0>(ddl_attributes[a]))) + .set_cell_val_num(std::get<1>(ddl_attributes[a])) + .set_nullable(std::get<2>(ddl_attributes[a])); + schema.add_attribute(aa); + } + + Array::create(array_name, schema); + test::DeleteArrayGuard del(ctx.ptr().get(), array_name.c_str()); + + std::vector api_subarray; + api_subarray.reserve(2 * subarray.size()); + 
for (const auto& sub_dim : subarray) { + api_subarray.push_back(sub_dim.lower_bound); + api_subarray.push_back(sub_dim.upper_bound); + } + + std::vector tile_extents; + for (const auto& dimension : dimensions) { + tile_extents.push_back(dimension.extent); + } + + sm::NDRange smsubarray; + + // write data, should be split into multiple fragments + templates::query::fragment_field_sizes_t cursor; + { + Array array(ctx, array_name, TILEDB_WRITE); + + Subarray sub(ctx, array); + sub.set_subarray(api_subarray); + + Query query(ctx, array, TILEDB_WRITE); + query.set_layout(TILEDB_GLOBAL_ORDER); + query.set_subarray(sub); + query.ptr().get()->query_->set_fragment_size(max_fragment_size); + + smsubarray = sub.ptr()->subarray()->ndrange(0); + + sm::NDRange smsubarray_aligned = smsubarray; + array.schema() + .ptr() + ->array_schema() + ->domain() + .expand_to_tiles_when_no_current_domain(smsubarray_aligned); + + uint64_t cells_written = 0; + while (templates::query::num_cells(attributes, cursor) < + attributes.num_cells()) { + const uint64_t cells_this_write = std::min( + attributes.num_cells() - cells_written, + write_unit_num_cells.value_or(attributes.num_cells())); + + const F attributes_this_write = + attributes.slice(cells_written, cells_this_write); + + auto field_sizes = templates::query::make_field_sizes( + attributes_this_write, cells_this_write); + templates::query::accumulate_cursor( + attributes_this_write, cursor, field_sizes); + + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + const_cast(attributes_this_write), + [](unsigned d) { return "d" + std::to_string(d + 1); }, + [](unsigned a) { return "a" + std::to_string(a + 1); }); + + const auto status = query.submit(); + ASSERTER(status == Query::Status::COMPLETE); + + const uint64_t cells_written_this_write = + templates::query::num_cells( + attributes_this_write, field_sizes); + ASSERTER(cells_written_this_write == cells_this_write); + + cells_written += 
cells_written_this_write; + ASSERTER( + cells_written == + templates::query::num_cells(attributes, cursor)); + + const auto w = dynamic_cast( + query.ptr()->query_->strategy()); + ASSERTER(w); + const auto g = w->get_global_state(); + ASSERTER(g); + + // Check assumptions about memory buffering. + // There may be a tail of tiles for which we cannot infer whether they + // would fit in the current fragment while also forming a rectangle. + // The writer keeps these in memory until it has enough information + // in the next `submit`. Check our assumptions about those tiles. + uint64_t in_memory_size = 0; + std::optional in_memory_num_tiles; + for (const auto& field : g->last_tiles_) { + // NB: there should always be at least one tile which contains the + // state of the current fragment + ASSERTER(!field.second.empty()); + + for (uint64_t t = 0; t < field.second.size() - 1; t++) { + const auto s = field.second[t].filtered_size(); + ASSERTER(s.has_value()); + in_memory_size += s.value(); + } + + if (in_memory_num_tiles.has_value()) { + ASSERTER(field.second.size() - 1 == in_memory_num_tiles.value()); + } else { + in_memory_num_tiles = field.second.size() - 1; + } + } + // it should be an error if they exceed the max fragment size + ASSERTER(in_memory_size <= max_fragment_size); + + // and if they form a rectangle then we could have written some out + ASSERTER(in_memory_num_tiles.has_value()); + for (uint64_t num_tiles = 0; num_tiles < in_memory_num_tiles.value(); + num_tiles++) { + const sm::IsRectangularDomain rectangle = + sm::is_rectangular_domain( + static_cast(tile_order), + tile_extents, + smsubarray_aligned, + g->dense_.domain_tile_offset_, + g->frag_meta_->tile_index_base() + num_tiles); + if (num_tiles == 0) { + ASSERTER(rectangle == sm::IsRectangularDomain::Yes); + } else { + // if `Never` then we should have started a new fragment + // to avoid buffering up until we hit the tile size + ASSERTER(rectangle == sm::IsRectangularDomain::No); + } + } + } + + 
query.finalize(); + } + + // then read back + F read; + { + templates::query::resize(read, cursor); + + Array array(ctx, array_name, TILEDB_READ); + + Subarray sub(ctx, array); + sub.set_subarray(api_subarray); + + Query query(ctx, array, TILEDB_READ); + query.set_layout(TILEDB_GLOBAL_ORDER); + query.set_subarray(sub); + + auto read_field_sizes = + templates::query::make_field_sizes(read); + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + read_field_sizes, + read, + [](unsigned d) { return "d" + std::to_string(d + 1); }, + [](unsigned a) { return "a" + std::to_string(a + 1); }); + + auto st = query.submit(); + ASSERTER(st == Query::Status::COMPLETE); + + ASSERTER(read_field_sizes == cursor); + } + + const std::vector>> fragment_domains = + collect_and_validate_fragment_domains( + ctx, + static_cast(tile_order), + array_name, + tile_extents, + smsubarray, + max_fragment_size); + + // this is last because a fragment domain mismatch is more informative + ASSERTER(read == attributes); + + return fragment_domains; +} + +template +std::vector>> +instance_dense_global_order( + const Context& ctx, + const std::string& array_name, + tiledb_layout_t tile_order, + tiledb_layout_t cell_order, + uint64_t max_fragment_size, + const std::vector>& dimensions, + const std::vector>& subarray, + std::optional write_unit_num_cells = std::nullopt) { + const std::optional num_cells = subarray_num_cells(subarray); + ASSERTER(num_cells.has_value()); + + const int a_offset = 77; + std::vector a_write; + a_write.reserve(num_cells.value()); + for (int i = 0; i < static_cast(num_cells.value()); i++) { + a_write.push_back(a_offset + i); + } + + templates::Fragment, std::tuple> attributes; + std::get<0>(attributes.attributes()) = a_write; + + return instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray, + attributes, + write_unit_num_cells); +} + +/** + * Tests that the max fragment size parameter is 
properly respected + * for global order writes to dense arrays. + */ +TEST_CASE("C++ API: Max fragment size dense array", "[cppapi][max-frag-size]") { + VFSTestSetup vfs; + Context ctx(vfs.ctx()); + + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order"); + + const tiledb_layout_t tile_order = + GENERATE(TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR); + const tiledb_layout_t cell_order = + GENERATE(TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR); + + DYNAMIC_SECTION( + "tile_order = " << sm::layout_str(static_cast(tile_order)) + << ", cell_order = " + << sm::layout_str(static_cast(cell_order))) { + // each tile is a full row of a 2D array + // NB: since each tile is a whole row we observe the same results regardless + // of tile order + SECTION("Row tiles") { + using Dim = templates::Dimension; + using Dom = templates::Domain; + + constexpr uint64_t max_fragment_size = 64 * 1024; + + constexpr size_t span_d2 = 10000; + const std::vector dimensions = { + Dim(0, std::numeric_limits::max() - 1, 1), + Dim(0, span_d2 - 1, span_d2)}; + + const uint64_t base_d1 = 12345; + const uint64_t num_rows = GENERATE(1, 2, 4, 8); + const std::vector subarray = { + Dom(base_d1 + 0, base_d1 + num_rows - 1), Dom(0, span_d2 - 1)}; + + const uint64_t write_unit_num_cells = GENERATE(0, 64, 1024, 1024 * 1024); + + DYNAMIC_SECTION( + "num_rows = " << num_rows << ", write_unit_num_cells = " + << write_unit_num_cells) { + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray, + write_unit_num_cells == 0 ? 
+ std::nullopt : + std::optional{write_unit_num_cells}); + + std::vector> expect; + for (uint64_t r = 0; r < num_rows; r++) { + expect.push_back( + {Dom(base_d1 + r, base_d1 + r), Dom(0, span_d2 - 1)}); + } + + CHECK(expect == actual); + } + } + + // each tile is some rectangle of a 2D array + SECTION("Rectangle tiles") { + using Dim = templates::Dimension; + using Dom = templates::Domain; + + const uint64_t d1_extent = GENERATE(8, 4); + constexpr size_t d2_span = 10000; + REQUIRE(d2_span % d1_extent == 0); // for row major + + const uint64_t d1_subarray = 16; + REQUIRE(d2_span % d1_subarray == 0); // for column major + + const std::vector dimensions = { + Dim(0, std::numeric_limits::max() - 1, d1_extent), + Dim(0, d2_span - 1, d2_span / d1_extent)}; + + const uint64_t d1_start_offset = GENERATE(0, 1); + const uint64_t d1_end_offset = GENERATE(0, 1); + const uint64_t d1_start = 100 + d1_start_offset; + const uint64_t d1_end = d1_start + d1_subarray - 1 - d1_end_offset; + const std::vector subarray = { + Dom(d1_start, d1_end), Dom(0, d2_span - 1)}; + + const uint64_t max_fragment_size = 4 * 64 * 1024; + + const uint64_t write_unit_num_cells = GENERATE(0, 64, 1024, 1024 * 1024); + + DYNAMIC_SECTION( + "start_offset = " + << d1_start_offset << ", end_offset = " << d1_end_offset + << ", extent = " << d1_extent + << ", write_unit_num_cells = " << write_unit_num_cells) { + if (d1_extent == 8) { + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + REQUIRE_THROWS( + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray), + expect); + } else if (d1_start_offset + d1_end_offset > 0) { + // if this constraint is ever relaxed this test must be extended + // with new inputs which are offset within a tile + const auto expect = Catch::Matchers::ContainsSubstring( + "the subarray must coincide with the tile bounds"); 
+ REQUIRE_THROWS( + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray, + write_unit_num_cells == 0 ? + std::nullopt : + std::optional(write_unit_num_cells)), + expect); + } else { + std::vector> expect; + if (tile_order == TILEDB_ROW_MAJOR) { + expect = { + {Dom(d1_start + 0 * d1_extent, d1_start + 1 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {Dom(d1_start + 1 * d1_extent, d1_start + 2 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {Dom(d1_start + 2 * d1_extent, d1_start + 3 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {Dom(d1_start + 3 * d1_extent, d1_start + 4 * d1_extent - 1), + Dom(0, d2_span - 1)}}; + } else { + expect = { + {Dom(d1_start, d1_start + d1_subarray - 1), + Dom(0 * (d2_span / 4), 1 * (d2_span / 4) - 1)}, + {Dom(d1_start, d1_start + d1_subarray - 1), + Dom(1 * (d2_span / 4), 2 * (d2_span / 4) - 1)}, + {Dom(d1_start, d1_start + d1_subarray - 1), + Dom(2 * (d2_span / 4), 3 * (d2_span / 4) - 1)}, + {Dom(d1_start, d1_start + d1_subarray - 1), + Dom(3 * (d2_span / 4), 4 * (d2_span / 4) - 1)}, + }; + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray); + + CHECK(expect == actual); + } + } + } + + // Each tile is a rectangular prism of height 1 + // Use the same inputs as above except there is a third outer dimension with + // extent 1 + SECTION("Flat rectangular prism tiles") { + using Dim = templates::Dimension; + using Dom = templates::Domain; + + const uint64_t d0_extent = 1; + const Dom d0_height(0, 0); + + const uint64_t d1_extent = GENERATE(8, 4); + constexpr size_t d2_span = 10000; + REQUIRE(d2_span % d1_extent == 0); // for row major + + const uint64_t d1_subarray = 16; + REQUIRE(d2_span % d1_subarray == 0); // for column major + + const std::vector dimensions = { + Dim(0, std::numeric_limits::max() - 1, d0_extent), + Dim(0, std::numeric_limits::max() - 1, d1_extent), + Dim(0, 
d2_span - 1, d2_span / d1_extent)}; + + const uint64_t d1_start_offset = GENERATE(0, 1); + const uint64_t d1_end_offset = GENERATE(0, 1); + const uint64_t d1_start = 100 + d1_start_offset; + const uint64_t d1_end = d1_start + d1_subarray - 1 - d1_end_offset; + const std::vector subarray = { + d0_height, Dom(d1_start, d1_end), Dom(0, d2_span - 1)}; + + const uint64_t max_fragment_size = 4 * 64 * 1024; + + const uint64_t write_unit_num_cells = GENERATE(0, 64, 1024, 1024 * 1024); + + DYNAMIC_SECTION( + "start_offset = " + << d1_start_offset << ", end_offset = " << d1_end_offset + << ", extent = " << d1_extent + << ", write_unit_num_cells = " << write_unit_num_cells) { + if (d1_extent == 8) { + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + REQUIRE_THROWS( + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray), + expect); + } else if (d1_start_offset + d1_end_offset > 0) { + // if this constraint is ever relaxed this test must be extended + // with new inputs which are offset within a tile + const auto expect = Catch::Matchers::ContainsSubstring( + "the subarray must coincide with the tile bounds"); + REQUIRE_THROWS(instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray, + write_unit_num_cells == 0 ? 
+ std::nullopt : + std::optional(write_unit_num_cells))); + } else { + std::vector> expect; + if (tile_order == TILEDB_ROW_MAJOR) { + expect = { + {d0_height, + Dom(d1_start + 0 * d1_extent, d1_start + 1 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {d0_height, + Dom(d1_start + 1 * d1_extent, d1_start + 2 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {d0_height, + Dom(d1_start + 2 * d1_extent, d1_start + 3 * d1_extent - 1), + Dom(0, d2_span - 1)}, + {d0_height, + Dom(d1_start + 3 * d1_extent, d1_start + 4 * d1_extent - 1), + Dom(0, d2_span - 1)}}; + } else { + expect = { + {d0_height, + Dom(d1_start, d1_start + d1_subarray - 1), + Dom(0 * (d2_span / 4), 1 * (d2_span / 4) - 1)}, + {d0_height, + Dom(d1_start, d1_start + d1_subarray - 1), + Dom(1 * (d2_span / 4), 2 * (d2_span / 4) - 1)}, + {d0_height, + Dom(d1_start, d1_start + d1_subarray - 1), + Dom(2 * (d2_span / 4), 3 * (d2_span / 4) - 1)}, + {d0_height, + Dom(d1_start, d1_start + d1_subarray - 1), + Dom(3 * (d2_span / 4), 4 * (d2_span / 4) - 1)}, + }; + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray); + + CHECK(expect == actual); + } + } + } + } + + // examples found from the rapidcheck test + SECTION("Shrinking") { + using Dim = templates::Dimension; + using Dom = templates::Domain; + + SECTION("Example 1") { + Dim d1(0, 0, 1); + Dim d2(0, 0, 1); + Dom s1(0, 0); + Dom s2(0, 0); + const uint64_t max_fragment_size = 24; + + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {d1, d2}, + {s1, s2}); + } + + SECTION("Example 2") { + Dim d1(1, 26, 2); + Dim d2(0, 0, 1); + Dom s1(1, 2); + Dom s2(0, 0); + const uint64_t max_fragment_size = 28; + + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {d1, d2}, + {s1, s2}); + } + } +} + +/** + * @return a generator which produces subarrays whose bounds are aligned to the + * 
tiles of `arraydomain` + */ +namespace rc { +template +Gen::domain_type>> +make_tile_aligned_subarray( + const std::vector>& arraydomain) { + using Dom = typename templates::Dimension::domain_type; + + // dense subarrays have to be aligned to tile boundaries + // so choose the tiles in each dimension that the subarray will overlap + std::vector>> gen_subarray_tiles; + for (const auto& dimension : arraydomain) { + const uint64_t tile_ub = + (dimension.domain.upper_bound - dimension.domain.lower_bound) / + dimension.extent; + gen_subarray_tiles.push_back(make_range( + templates::Domain(0, std::min(64, tile_ub)))); + } + + return gen::exec([gen_subarray_tiles, arraydomain]() { + std::vector> subarray_tiles; + for (const auto& gen_dim : gen_subarray_tiles) { + subarray_tiles.push_back(*gen_dim); + } + + std::vector subarray; + auto to_subarray = [&]() -> std::vector& { + subarray.clear(); + for (uint64_t d = 0; d < arraydomain.size(); d++) { + subarray.push_back(Dom( + arraydomain[d].domain.lower_bound + + subarray_tiles[d].lower_bound * arraydomain[d].extent, + arraydomain[d].domain.lower_bound + + (subarray_tiles[d].upper_bound + 1) * arraydomain[d].extent - + 1)); + } + return subarray; + }; + + uint64_t num_cells_per_tile = 1; + for (const auto& dim : arraydomain) { + num_cells_per_tile *= dim.extent; + } + + // clamp to a hopefully reasonable limit (if the other attempts failed) + // avoid too many cells, and avoid too many tiles + std::optional num_cells; + while (!(num_cells = subarray_num_cells(to_subarray())).has_value() || + num_cells.value() >= 1024 * 1024 * 4 || + (num_cells.value() / num_cells_per_tile) >= 16 * 1024) { + for (uint64_t d = subarray.size(); d > 0; --d) { + auto& dtiles = subarray_tiles[d - 1]; + if (dtiles.num_cells() > 4) { + dtiles.upper_bound = (dtiles.lower_bound + dtiles.upper_bound) / 2; + break; + } + } + } + + return to_subarray(); + }); +} + +} // namespace rc + +/** + * Generates an arbitrary expected-to-not-error input to + * 
`instance_dense_global_order` of an appropriate size for the given + * `dimensions`. + * + * "Appropriate size" means tiles with at most `1024 * 128` cells, and a write + * domain with at most `1024 * 1024 * 4` cells (see + * `make_tile_aligned_subarray`). We expect that this should allow inputs which + * are large enough to be interesting but not so large that each instance takes + * a long time. + * + * Inputs generated by this test function are expected to successfully write + * fragments within the generated max fragment size. The maximum fragment size + * is a number of bytes which represents between 1 and 8 hyperrows. + */ +template +void rapidcheck_dense_array( + Context& ctx, + const std::string& array_name, + const std::vector>& dimensions) { + uint64_t num_cells_per_tile = 1; + for (const auto& dim : dimensions) { + num_cells_per_tile *= dim.extent; + } + RC_PRE(num_cells_per_tile <= 1024 * 128); + + const tiledb_layout_t tile_order = + *rc::gen::element(TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR); + const tiledb_layout_t cell_order = + *rc::gen::element(TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR); + + const uint64_t tile_size = num_cells_per_tile * sizeof(int); + const uint64_t filter_chunk_size = + sm::WriterTile::compute_chunk_size(tile_size, sizeof(int)); + const uint64_t num_filter_chunks_per_tile = + (tile_size + filter_chunk_size - 1) / filter_chunk_size; + + const uint64_t estimate_single_tile_fragment_size = + num_cells_per_tile * sizeof(int) // data + + sizeof(uint64_t) // prefix containing the number of chunks + + num_filter_chunks_per_tile * 3 * sizeof(uint32_t); // chunk sizes + + const auto subarray = + *rc::make_tile_aligned_subarray(dimensions); + + uint64_t num_tiles_per_hyperrow = 1; + for (uint64_t i = 0; i < dimensions.size() - 1; i++) { + const uint64_t dim = + (tile_order == TILEDB_ROW_MAJOR ? 
i + 1 : dimensions.size() - i - 2); + num_tiles_per_hyperrow *= dimensions[dim].num_tiles(subarray[dim]); + } + + const uint64_t num_tiles_total = + num_tiles_per_hyperrow * + (tile_order == TILEDB_ROW_MAJOR ? + (dimensions[0].num_tiles(subarray[0])) : + (dimensions.back().num_tiles(subarray.back()))); + + auto gen_fragment_size = rc::gen::inRange( + estimate_single_tile_fragment_size, + num_tiles_per_hyperrow * estimate_single_tile_fragment_size * 8); + const uint64_t max_fragment_size = *gen_fragment_size; + + auto gen_write_unit_num_cells = + rc::gen::inRange(1, num_tiles_total * num_cells_per_tile); + const uint64_t write_unit_num_cells = *gen_write_unit_num_cells; + + instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + dimensions, + subarray, + write_unit_num_cells); +} + +TEST_CASE( + "C++ API: Max fragment size dense array rapidcheck 1d", + "[cppapi][max-frag-size][rapidcheck]") { + static constexpr auto DT = sm::Datatype::UINT64; + using Dim64 = templates::Dimension
; + using Dom64 = Dim64::domain_type; + + VFSTestSetup vfs; + Context ctx(vfs.ctx()); + + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order_rapidcheck_1d"); + + SECTION("Shrinking") { + instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_ROW_MAJOR, + 2396, + {Dim64(0, 8929, 594)}, + {Dom64(0, 2969)}); + } + + rc::prop("max fragment size dense 1d", [&]() { + Dim64 d1 = *rc::make_dimension
(8192); + + rapidcheck_dense_array
(ctx, array_name, {d1}); + }); +} + +TEST_CASE( + "C++ API: Max fragment size dense array rapidcheck 2d", + "[cppapi][max-frag-size][rapidcheck]") { + static constexpr auto DT = sm::Datatype::UINT64; + using Dim64 = templates::Dimension
; + using Dom64 = Dim64::domain_type; + + VFSTestSetup vfs; + Context ctx(vfs.ctx()); + + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order_rapidcheck_2d"); + + SECTION("Shrinking") { + SECTION("Example 1") { + instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_COL_MAJOR, + 48, + {Dim64(0, 116, 1), Dim64(0, 0, 1)}, + {Dom64(2, 20), Dom64(0, 0)}); + } + + SECTION("Example 2") { + instance_dense_global_order( + ctx, + array_name, + TILEDB_COL_MAJOR, + TILEDB_ROW_MAJOR, + 24, + {Dim64(0, 60, 1), Dim64(0, 20, 1)}, + {Dom64(0, 1), Dom64(0, 1)}); + } + + SECTION("Example 3") { + instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_ROW_MAJOR, + 48, + {Dim64(0, 35, 1), Dim64(0, 420, 1)}, + {Dom64(0, 1), Dom64(0, 4)}, + 1); + } + + SECTION("Example 4") { + /* + * In this example we end up with a fragment which fills all but one tile + * of a single row. The last tile in the row has to be its own fragment. + */ + auto fragments = instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_ROW_MAJOR, + 924, + {Dim64(0, 304, 8), Dim64(0, 147, 2)}, + {Dom64(0, 31), Dom64(0, 23)}, + 41); + } + } + + rc::prop("max fragment size dense 2d", [&]() { + Dim64 d1 = *rc::make_dimension
(128); + Dim64 d2 = *rc::make_dimension
(128); + + rapidcheck_dense_array
(ctx, array_name, {d1, d2}); + }); +} + +TEST_CASE( + "C++ API: Max fragment size dense array rapidcheck 3d", + "[cppapi][max-frag-size][rapidcheck]") { + static constexpr auto DT = sm::Datatype::UINT64; + using Dim64 = templates::Dimension
; + using Dom64 = Dim64::domain_type; + + VFSTestSetup vfs; + Context ctx(vfs.ctx()); + + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order_rapidcheck_3d"); + + SECTION("Shrinking") { + instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_ROW_MAJOR, + 2160, + {Dim64(0, 85, 5), Dim64(0, 102, 2), Dim64(0, 37, 1)}, + {Dom64(5, 19), Dom64(4, 15), Dom64(1, 6)}); + } + + rc::prop("max fragment size dense 3d", [&]() { + Dim64 d1 = *rc::make_dimension
(32); + Dim64 d2 = *rc::make_dimension
(32); + Dim64 d3 = *rc::make_dimension
(32); + + rapidcheck_dense_array
(ctx, array_name, {d1, d2, d3}); + }); +} + +/** + * Test some edge cases induced by variable-length tiles + */ +TEST_CASE( + "C++ API: Max fragment size dense array var size tiles", + "[cppapi][max-frag-size]") { + VFSTestSetup vfs; + Context ctx(vfs.ctx()); + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order_var"); + + using Dim64 = templates::Dimension; + using Dom64 = Dim64::domain_type; + + using F = templates::Fragment, std::tuple>>; + + const tiledb_layout_t tile_order = TILEDB_ROW_MAJOR; + const tiledb_layout_t cell_order = TILEDB_ROW_MAJOR; + + SECTION("Rectangle tiles") { + const uint64_t d1_extent = 8; + const uint64_t d2_span = 10000; + REQUIRE(d2_span % d1_extent == 0); + + const uint64_t d2_extent = d2_span / d1_extent; + + const Dim64 row(0, std::numeric_limits::max() - 1, d1_extent); + const Dim64 col(0, d2_span - 1, d2_extent); + + const Dom64 subrow(0, 2 * d1_extent - 1); + const Dom64 subcol = col.domain; + + auto make_subcol = [&](uint64_t start_tile, uint64_t end_tile) -> Dom64 { + const uint64_t tile_span = d2_extent * d1_extent / 8; + return Dom64(tile_span * start_tile, tile_span * end_tile - 1); + }; + + const Dom64 subrow_0(0, d1_extent - 1); + const Dom64 subrow_1(d1_extent, 2 * d1_extent - 1); + + const std::optional num_cells = + subarray_num_cells(std::vector{subrow, subcol}); + REQUIRE(num_cells.has_value()); + + const uint64_t approx_tiles_per_fragment = GENERATE(4, 9); + const uint64_t max_fragment_size = approx_tiles_per_fragment * 64 * 1024; + + F attributes; + attributes.reserve(num_cells.value()); + + const std::optional write_unit_num_cells = GENERATE_COPY( + std::optional{}, + 64, + 1024, + 1024 * 1024, + num_cells.value() - 1); + + const uint64_t num_cells_per_tile = d1_extent * d2_extent; + + DYNAMIC_SECTION( + "approx_tiles_per_fragment = " << approx_tiles_per_fragment) { + DYNAMIC_SECTION( + "write_unit_num_cells = " + << (write_unit_num_cells.has_value() ? 
+ std::to_string(write_unit_num_cells.value()) : + "unlimited")) { + SECTION("Even") { + for (uint64_t c = 0; c < num_cells.value(); c++) { + const std::string str = std::to_string(c); + std::get<0>(attributes.attributes()) + .push_back(std::span(str.begin(), str.end())); + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {row, col}, + {subrow, subcol}, + attributes, + write_unit_num_cells); + + std::vector> expect; + if (approx_tiles_per_fragment == 4) { + expect.push_back({subrow_0, make_subcol(0, 4)}); + expect.push_back({subrow_0, make_subcol(4, 8)}); + expect.push_back({subrow_1, make_subcol(0, 3)}); + expect.push_back({subrow_1, make_subcol(3, 6)}); + expect.push_back({subrow_1, make_subcol(6, 8)}); + } else { + expect.push_back({subrow_0, subcol}); + expect.push_back({subrow_1, subcol}); + } + CHECK(expect == actual); + } + + SECTION("Skew first tile") { + // inflate all the records of the first tile + for (uint64_t c = 0; c < num_cells.value(); c++) { + const std::string str = + (c < num_cells_per_tile ? 
"foobargubquux" + std::to_string(c) : + std::to_string(c)); + std::get<0>(attributes.attributes()) + .push_back(std::span(str.begin(), str.end())); + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {row, col}, + {subrow, subcol}, + attributes, + write_unit_num_cells); + + std::vector> expect; + if (approx_tiles_per_fragment == 4) { + expect.push_back({subrow_0, make_subcol(0, 2)}); + expect.push_back({subrow_0, make_subcol(2, 6)}); + expect.push_back({subrow_0, make_subcol(6, 8)}); + expect.push_back({subrow_1, make_subcol(0, 3)}); + expect.push_back({subrow_1, make_subcol(3, 6)}); + expect.push_back({subrow_1, make_subcol(6, 8)}); + } else { + expect.push_back({subrow_0, make_subcol(0, 7)}); + expect.push_back({subrow_0, make_subcol(7, 8)}); + expect.push_back({subrow_1, subcol}); + } + CHECK(expect == actual); + } + + SECTION("Skew second tile") { + // inflate all the records of the second tile + for (uint64_t c = 0; c < num_cells.value(); c++) { + const std::string str = + (num_cells_per_tile <= c && c < 2 * num_cells_per_tile ? 
+ "foobargubquux" + std::to_string(c) : + std::to_string(c)); + std::get<0>(attributes.attributes()) + .push_back(std::span(str.begin(), str.end())); + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {row, col}, + {subrow, subcol}, + attributes, + write_unit_num_cells); + + std::vector> expect; + if (approx_tiles_per_fragment == 4) { + expect.push_back({subrow_0, make_subcol(0, 2)}); + expect.push_back({subrow_0, make_subcol(2, 6)}); + expect.push_back({subrow_0, make_subcol(6, 8)}); + expect.push_back({subrow_1, make_subcol(0, 3)}); + expect.push_back({subrow_1, make_subcol(3, 6)}); + expect.push_back({subrow_1, make_subcol(6, 8)}); + } else { + expect.push_back({subrow_0, make_subcol(0, 7)}); + expect.push_back({subrow_0, make_subcol(7, 8)}); + expect.push_back({subrow_1, subcol}); + } + CHECK(expect == actual); + } + + SECTION("Skew last tile") { + // inflate all the records of the last tile + for (uint64_t c = 0; c < num_cells.value(); c++) { + const std::string str = + (num_cells.value() - num_cells_per_tile <= c ? 
+ "foobargubquux" + std::to_string(c) : + std::to_string(c)); + std::get<0>(attributes.attributes()) + .push_back(std::span(str.begin(), str.end())); + } + + const auto actual = instance_dense_global_order( + ctx, + array_name, + tile_order, + cell_order, + max_fragment_size, + {row, col}, + {subrow, subcol}, + attributes, + write_unit_num_cells); + + std::vector> expect; + if (approx_tiles_per_fragment == 4) { + expect.push_back({subrow_0, make_subcol(0, 4)}); + expect.push_back({subrow_0, make_subcol(4, 8)}); + expect.push_back({subrow_1, make_subcol(0, 3)}); + expect.push_back({subrow_1, make_subcol(3, 6)}); + expect.push_back({subrow_1, make_subcol(6, 7)}); + expect.push_back({subrow_1, make_subcol(7, 8)}); + } else { + expect.push_back({subrow_0, subcol}); + expect.push_back({subrow_1, make_subcol(0, 7)}); + expect.push_back({subrow_1, make_subcol(7, 8)}); + } + CHECK(expect == actual); + } + } + } + } +} + +TEST_CASE( + "C++ API: Max fragment size dense unsupported on REST", "[cppapi][rest]") { + VFSTestSetup vfs; + if (!vfs.is_rest()) { + SKIP("Test is only applicable to REST client"); + } + + const std::string array_name = + vfs.array_uri("max_fragment_size_dense_global_order_rest_support"); + + Context ctx(vfs.ctx()); + + using Dim = templates::Dimension; + using Dom = Dim::domain_type; + + Dim d1(0, 0, 1); + Dim d2(0, 0, 1); + Dom s1(0, 0); + Dom s2(0, 0); + const uint64_t max_fragment_size = 24; + + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is not supported for remote global order writes to dense " + "arrays."); + + REQUIRE_THROWS( + instance_dense_global_order( + ctx, + array_name, + TILEDB_ROW_MAJOR, + TILEDB_ROW_MAJOR, + max_fragment_size, + {d1, d2}, + {s1, s2}), + expect); +} diff --git a/test/src/unit-sparse-global-order-reader.cc b/test/src/unit-sparse-global-order-reader.cc index 8c5c75d730a..8d73eec6e10 100644 --- a/test/src/unit-sparse-global-order-reader.cc +++ b/test/src/unit-sparse-global-order-reader.cc @@ 
-3598,19 +3598,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(cursor_cells + num_cells <= expect.size()); // accumulate - std::apply( - [&](auto&... field) { - std::apply( - [&](auto&... field_cursor) { - std::apply( - [&](const auto&... field_size) { - (field.accumulate_cursor(field_cursor, field_size), ...); - }, - field_sizes); - }, - outcursor); - }, - std::tuple_cat(outdims, outatts)); + templates::query::accumulate_cursor(out, outcursor, field_sizes); if (status == TILEDB_COMPLETED) { break; @@ -3620,15 +3608,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { // Clean up. tiledb_query_free(&query); - std::apply( - [outcursor](auto&... outfield) { - std::apply( - [&](const auto&... field_cursor) { - (outfield.finish_multipart_read(field_cursor), ...); - }, - outcursor); - }, - std::tuple_cat(outdims, outatts)); + templates::query::resize(out, outcursor); ASSERTER(expect.dimensions() == outdims); diff --git a/test/support/CMakeLists.txt b/test/support/CMakeLists.txt index 6eb891a6dba..caae00aa1a9 100644 --- a/test/support/CMakeLists.txt +++ b/test/support/CMakeLists.txt @@ -36,7 +36,8 @@ list(APPEND TILEDB_CORE_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/tiledb/sm/c_api") # Gather the test source files set(TILEDB_TEST_SUPPORT_SOURCES - rapidcheck/show.cc + rapidcheck/show/array_schema_templates.cc + rapidcheck/show/query_ast.cc src/array_helpers.cc src/array_schema_helpers.cc src/ast_helpers.h diff --git a/test/support/rapidcheck/array_schema_templates.h b/test/support/rapidcheck/array_schema_templates.h new file mode 100644 index 00000000000..642a25d5b6f --- /dev/null +++ b/test/support/rapidcheck/array_schema_templates.h @@ -0,0 +1,198 @@ +/** + * @file test/support/rapidcheck/array_schema_templates.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines rapidcheck generators for the structures + * defined in test/support/src/array_schema_templates.h. + */ + +#ifndef TILEDB_RAPIDCHECK_ARRAY_SCHEMA_H +#define TILEDB_RAPIDCHECK_ARRAY_SCHEMA_H + +#include +#include +#include + +#include "tiledb/common/arithmetic.h" + +namespace rc { + +using namespace tiledb::test; +using namespace tiledb::test::templates; + +template +Gen> make_domain(std::optional bound = std::nullopt) { + auto bounds = gen::mapcat(gen::arbitrary(), [bound](D lb) { + const D ub_limit = + (bound.has_value() ? 
+ tiledb::common::checked_arithmetic::add(lb, bound.value()) + .value_or(std::numeric_limits::max()) : + std::numeric_limits::max()); + if constexpr (std::is_same_v || std::is_same_v) { + return gen::pair(gen::just(lb), gen::inRange(lb, ub_limit)); + } else { + // NB: `gen::inRange` is exclusive at the upper end but tiledb domain is + // inclusive. So we have to use `int64_t` to avoid overflow. + return gen::pair( + gen::just(lb), + gen::cast(gen::inRange(int64_t(lb), int64_t(ub_limit) + 1))); + } + }); + + return gen::map(bounds, [](std::pair bounds) { + return templates::Domain(bounds.first, bounds.second); + }); +} + +template +struct Arbitrary> { + static Gen> arbitrary() { + return make_domain(); + } +}; + +template +Gen make_extent( + const templates::Domain& domain, std::optional bound = std::nullopt) { + // upper bound on all possible extents to avoid unreasonably + // huge tile sizes + static constexpr D extent_limit = static_cast( + std::is_signed::value ? + std::min( + static_cast(std::numeric_limits::max()), + static_cast(1024 * 16)) : + std::min( + static_cast(std::numeric_limits::max()), + static_cast(1024 * 16))); + + const D extent_bound = + (bound.has_value() ? std::min(bound.value(), extent_limit) : + extent_limit); + + // NB: `gen::inRange` is exclusive at the upper end but tiledb domain is + // inclusive. So we have to be careful to avoid overflow. + + D extent_lower_bound = 1; + D extent_upper_bound; + + const auto bound_distance = tiledb::common::checked_arithmetic::sub( + domain.upper_bound, domain.lower_bound); + if (bound_distance.has_value()) { + extent_upper_bound = + (bound_distance.value() < extent_bound ? 
bound_distance.value() + 1 : + extent_bound); + } else { + extent_upper_bound = extent_bound; + } + + return gen::inRange(extent_lower_bound, extent_upper_bound + 1); +} + +template +Gen> make_dimension( + std::optional::value_type> extent_bound = + std::nullopt, + std::optional::value_type> domain_bound = + std::nullopt) { + using CoordType = templates::Dimension::value_type; + auto tup = gen::mapcat( + make_domain(domain_bound), + [extent_bound](Domain domain) { + return gen::pair(gen::just(domain), make_extent(domain, extent_bound)); + }); + + return gen::map(tup, [](std::pair, CoordType> tup) { + return templates::Dimension(tup.first, tup.second); + }); +} + +template +struct Arbitrary> { + static Gen> arbitrary() { + return make_dimension(); + } +}; + +template +Gen make_coordinate(const templates::Domain& domain) { + // `gen::inRange` does an exclusive upper bound, + // whereas the domain upper bound is inclusive. + // As a result some contortion is required to deal + // with numeric_limits. 
+ if constexpr (std::is_same_v) { + // NB: poor performance with small domains for sure + return gen::suchThat( + gen::map( + gen::string(), + [](std::string s) { + StringDimensionCoordType v(s.begin(), s.end()); + return v; + }), + [domain](const StringDimensionCoordType& s) { + return domain.lower_bound <= s && s <= domain.upper_bound; + }); + } else if constexpr (std::is_signed::value) { + if (int64_t(domain.upper_bound) < std::numeric_limits::max()) { + return gen::cast(gen::inRange( + int64_t(domain.lower_bound), int64_t(domain.upper_bound + 1))); + } else { + return gen::inRange(domain.lower_bound, domain.upper_bound); + } + } else { + if (uint64_t(domain.upper_bound) < std::numeric_limits::max()) { + return gen::cast(gen::inRange( + uint64_t(domain.lower_bound), uint64_t(domain.upper_bound + 1))); + } else { + return gen::inRange(domain.lower_bound, domain.upper_bound); + } + } +} + +template +Gen> make_range(const templates::Domain& domain) { + return gen::apply( + [](D p1, D p2) { return templates::Domain(p1, p2); }, + make_coordinate(domain), + make_coordinate(domain)); +} + +template <> +void show>(const templates::Domain& domain, std::ostream& os); + +template <> +void show>( + const templates::Domain& domain, std::ostream& os); + +template <> +void show>( + const templates::Dimension& dimension, + std::ostream& os); + +} // namespace rc + +#endif diff --git a/test/support/rapidcheck/array_templates.h b/test/support/rapidcheck/array_templates.h index f2c1dacc232..37762a9ba6b 100644 --- a/test/support/rapidcheck/array_templates.h +++ b/test/support/rapidcheck/array_templates.h @@ -34,6 +34,7 @@ #ifndef TILEDB_RAPIDCHECK_ARRAY_H #define TILEDB_RAPIDCHECK_ARRAY_H +#include #include #include #include @@ -43,139 +44,6 @@ namespace rc { using namespace tiledb::test; using namespace tiledb::test::templates; -template -struct Arbitrary> { - static Gen> arbitrary() { - // NB: `gen::inRange` is exclusive at the upper end but tiledb domain is - // inclusive. 
So we have to use `int64_t` to avoid overflow. - auto bounds = gen::mapcat(gen::arbitrary(), [](D lb) { - if constexpr (std::is_same::value) { - return gen::pair( - gen::just(lb), gen::inRange(lb, std::numeric_limits::max())); - } else if constexpr (std::is_same::value) { - return gen::pair( - gen::just(lb), gen::inRange(lb, std::numeric_limits::max())); - } else { - auto ub_limit = int64_t(std::numeric_limits::max()) + 1; - return gen::pair( - gen::just(lb), gen::cast(gen::inRange(int64_t(lb), ub_limit))); - } - }); - - return gen::map(bounds, [](std::pair bounds) { - return templates::Domain(bounds.first, bounds.second); - }); - } -}; - -/** - * @return `a - b` if it does not overflow, `std::nullopt` if it does - */ -template -std::optional checked_sub(T a, T b) { - if (!std::is_signed::value) { - return (b > a ? std::nullopt : std::optional(a - b)); - } else if (b < 0) { - return ( - std::numeric_limits::max() + b < a ? std::nullopt : - std::optional(a - b)); - } else { - return ( - std::numeric_limits::min() - b > a ? std::nullopt : - std::optional(a - b)); - } -} - -template -Gen make_extent(const templates::Domain& domain) { - // upper bound on all possible extents to avoid unreasonably - // huge tile sizes - static constexpr D extent_limit = static_cast( - std::is_signed::value ? - std::min( - static_cast(std::numeric_limits::max()), - static_cast(1024 * 16)) : - std::min( - static_cast(std::numeric_limits::max()), - static_cast(1024 * 16))); - - // NB: `gen::inRange` is exclusive at the upper end but tiledb domain is - // inclusive. So we have to be careful to avoid overflow. - - D extent_lower_bound = 1; - D extent_upper_bound; - - const auto bound_distance = - checked_sub(domain.upper_bound, domain.lower_bound); - if (bound_distance.has_value()) { - extent_upper_bound = - (bound_distance.value() < extent_limit ? 
bound_distance.value() + 1 : - extent_limit); - } else { - extent_upper_bound = extent_limit; - } - - return gen::inRange(extent_lower_bound, extent_upper_bound + 1); -} - -template -struct Arbitrary> { - static Gen> arbitrary() { - using CoordType = templates::Dimension::value_type; - auto tup = gen::mapcat( - gen::arbitrary>(), [](Domain domain) { - return gen::pair(gen::just(domain), make_extent(domain)); - }); - - return gen::map(tup, [](std::pair, CoordType> tup) { - return templates::Dimension(tup.first, tup.second); - }); - } -}; - -template -Gen make_coordinate(const templates::Domain& domain) { - // `gen::inRange` does an exclusive upper bound, - // whereas the domain upper bound is inclusive. - // As a result some contortion is required to deal - // with numeric_limits. - if constexpr (std::is_same_v) { - // NB: poor performance with small domains for sure - return gen::suchThat( - gen::map( - gen::string(), - [](std::string s) { - StringDimensionCoordType v(s.begin(), s.end()); - return v; - }), - [domain](const StringDimensionCoordType& s) { - return domain.lower_bound <= s && s <= domain.upper_bound; - }); - } else if constexpr (std::is_signed::value) { - if (int64_t(domain.upper_bound) < std::numeric_limits::max()) { - return gen::cast(gen::inRange( - int64_t(domain.lower_bound), int64_t(domain.upper_bound + 1))); - } else { - return gen::inRange(domain.lower_bound, domain.upper_bound); - } - } else { - if (uint64_t(domain.upper_bound) < std::numeric_limits::max()) { - return gen::cast(gen::inRange( - uint64_t(domain.lower_bound), uint64_t(domain.upper_bound + 1))); - } else { - return gen::inRange(domain.lower_bound, domain.upper_bound); - } - } -} - -template -Gen> make_range(const templates::Domain& domain) { - return gen::apply( - [](D p1, D p2) { return templates::Domain(p1, p2); }, - make_coordinate(domain), - make_coordinate(domain)); -} - template Gen> make_fragment_1d( bool allow_duplicates, const Domain& d) { @@ -307,10 +175,6 @@ Gen> 
make_fragment_3d( }); } -void showValue(const templates::Domain& domain, std::ostream& os); -void showValue(const templates::Domain& domain, std::ostream& os); -void showValue(const templates::Domain& domain, std::ostream& os); - namespace detail { /** diff --git a/test/support/rapidcheck/show/array_schema_templates.cc b/test/support/rapidcheck/show/array_schema_templates.cc new file mode 100644 index 00000000000..ca395c902f9 --- /dev/null +++ b/test/support/rapidcheck/show/array_schema_templates.cc @@ -0,0 +1,74 @@ +/** + * @file test/support/rapidcheck/show/array_schema_templates.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file provides forward declarations of `rc::detail::showValue` + * overloads, which seemingly must be included prior to the rapidcheck + * header files. 
+ */ + +#include +#include +#include + +namespace rc { + +template +void showImpl( + const tiledb::test::templates::Domain& domain, std::ostream& os) { + os << "[" << domain.lower_bound << ", " << domain.upper_bound << "]"; +} + +template <> +void show>( + const tiledb::test::templates::Domain& domain, std::ostream& os) { + showImpl(domain, os); +} + +template <> +void show>( + const tiledb::test::templates::Domain& domain, std::ostream& os) { + showImpl(domain, os); +} + +template +void showImpl( + const tiledb::test::templates::Dimension
& dimension, std::ostream& os) { + os << "{\"domain\": "; + showImpl(dimension.domain, os); + os << ", \"extent\": " << dimension.extent << "}"; +} + +template <> +void show>( + const templates::Dimension& dimension, + std::ostream& os) { + showImpl(dimension, os); +} + +} // namespace rc diff --git a/test/support/rapidcheck/show.cc b/test/support/rapidcheck/show/query_ast.cc similarity index 93% rename from test/support/rapidcheck/show.cc rename to test/support/rapidcheck/show/query_ast.cc index f3aeb2426db..f895667de4c 100644 --- a/test/support/rapidcheck/show.cc +++ b/test/support/rapidcheck/show/query_ast.cc @@ -1,5 +1,5 @@ /** - * @file test/support/rapidcheck/show.cc + * @file test/support/rapidcheck/show/query_ast.cc * * @section LICENSE * @@ -32,8 +32,11 @@ * header files. */ +#include #include +#include +#include "test/support/src/array_templates.h" #include "tiledb/sm/enums/query_condition_op.h" #include "tiledb/sm/query/ast/query_ast.h" diff --git a/test/support/src/array_schema_templates.h b/test/support/src/array_schema_templates.h new file mode 100644 index 00000000000..bd2b77059a8 --- /dev/null +++ b/test/support/src/array_schema_templates.h @@ -0,0 +1,219 @@ +/** + * @file test/support/src/array_schema_templates.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file provides templates for generic programming with respect + * to array schema, data types, etc. + */ + +#ifndef TILEDB_ARRAY_SCHEMA_TEMPLATES_H +#define TILEDB_ARRAY_SCHEMA_TEMPLATES_H + +#include "tiledb/type/datatype_traits.h" +#include "tiledb/type/range/range.h" + +#include +#include + +namespace tiledb::test::templates { + +using StringDimensionCoordType = std::vector; +using StringDimensionCoordView = std::span; + +/** + * Constrains types which can be used as the physical type of a dimension. + */ +template +concept DimensionType = + std::is_same_v or requires(const D& coord) { + typename std::is_signed; + { coord < coord } -> std::same_as; + { D(int64_t(coord)) } -> std::same_as; + }; + +/** + * Constrains types which can be used as the physical type of an attribute. + * + * Right now this doesn't constrain anything, it is just a marker for + * readability, and someday we might want it do require something. + * + * This used to have + * ``` + * typename query_buffers::cell_type; + * ``` + * but that was removed to simplify include whatnot and forward declaration etc + */ +template +concept AttributeType = true; + +/** + * A generic, statically-typed range which is inclusive on both ends. 
+ */ +template +struct Domain { + D lower_bound; + D upper_bound; + + Domain() { + } + + Domain(D d1, D d2) + : lower_bound(std::min(d1, d2)) + , upper_bound(std::max(d1, d2)) { + } + + bool operator==(const Domain&) const = default; + + uint64_t num_cells() const { + // FIXME: this is incorrect for 64-bit domains which need to check overflow + if (std::is_signed::value) { + return static_cast(upper_bound) - + static_cast(lower_bound) + 1; + } else { + return static_cast(upper_bound) - + static_cast(lower_bound) + 1; + } + } + + bool contains(D point) const { + return lower_bound <= point && point <= upper_bound; + } + + bool intersects(const Domain& other) const { + return (other.lower_bound <= lower_bound && + lower_bound <= other.upper_bound) || + (other.lower_bound <= upper_bound && + upper_bound <= other.upper_bound) || + (lower_bound <= other.lower_bound && + other.lower_bound <= upper_bound) || + (lower_bound <= other.upper_bound && + other.upper_bound <= upper_bound); + } + + tiledb::type::Range range() const { + return tiledb::type::Range(lower_bound, upper_bound); + } +}; + +/** + * A description of a dimension as it pertains to its datatype. + */ +template +struct Dimension { + using value_type = tiledb::type::datatype_traits
::value_type; + using domain_type = Domain; + + static constexpr tiledb::sm::Datatype DATATYPE = DT; + + Dimension() = default; + Dimension(Domain domain, value_type extent) + : domain(domain) + , extent(extent) { + } + + Dimension(value_type lower_bound, value_type upper_bound, value_type extent) + : Dimension(Domain(lower_bound, upper_bound), extent) { + } + + Domain domain; + value_type extent; + + /** + * @return the number of tiles spanned by the whole domain of this dimension + */ + uint64_t num_tiles() const { + return num_tiles(domain); + } + + /** + * @return the number of tiles spanned by a range in this dimension + */ + uint64_t num_tiles(const domain_type& range) const { + return (range.num_cells() + extent - 1) / extent; + } +}; + +template +struct static_attribute {}; + +template +struct static_attribute { + static constexpr tiledb::sm::Datatype datatype = DATATYPE; + static constexpr uint32_t cell_val_num = 1; + static constexpr bool nullable = false; + + using value_type = + typename tiledb::type::datatype_traits::value_type; + using cell_type = value_type; +}; + +template +struct static_attribute { + static constexpr tiledb::sm::Datatype datatype = DATATYPE; + static constexpr uint32_t cell_val_num = 1; + static constexpr bool nullable = true; + + using value_type = std::optional< + typename tiledb::type::datatype_traits::value_type>; + using cell_type = value_type; +}; + +template +struct static_attribute { + static constexpr tiledb::sm::Datatype datatype = DATATYPE; + static constexpr uint32_t cell_val_num = tiledb::sm::cell_val_num_var; + static constexpr bool nullable = false; + + using value_type = + typename tiledb::type::datatype_traits::value_type; + using cell_type = std::vector; +}; + +template +struct static_attribute { + static constexpr tiledb::sm::Datatype datatype = DATATYPE; + static constexpr uint32_t cell_val_num = tiledb::sm::cell_val_num_var; + static constexpr bool nullable = true; + + using value_type = + typename 
tiledb::type::datatype_traits::value_type; + using cell_type = std::optional>; +}; + +template +constexpr std::tuple +attribute_properties() { + return { + static_attribute::datatype, + static_attribute::cell_val_num, + static_attribute::nullable}; +} + +} // namespace tiledb::test::templates + +#endif diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 6e6d4f26da0..9dc422bc48a 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -42,6 +42,7 @@ #include "tiledb/type/range/range.h" #include +#include #include #include #include @@ -59,9 +60,6 @@ class Dimension; namespace tiledb::test::templates { -using StringDimensionCoordType = std::vector; -using StringDimensionCoordView = std::span; - /** * Adapts a `std::tuple` whose fields are all `GlobalCellCmp` * to itself be `GlobalCellCmp`. @@ -113,6 +111,27 @@ struct global_cell_cmp_std_tuple { StdTuple tup_; }; +/** + * Adapts a span of coordinates for comparison using `GlobalCellCmp`. + */ +template +struct global_cell_cmp_span { + global_cell_cmp_span(std::span values) + : values_(values) { + } + + tiledb::common::UntypedDatumView dimension_datum( + const tiledb::sm::Dimension&, unsigned dim_idx) const { + return UntypedDatumView(&values_[dim_idx], sizeof(Coord)); + } + + const void* coord(unsigned dim) const { + return &values_[dim]; + } + + std::span values_; +}; + /** * Forward declaration of query_buffers * which will be specialized. @@ -123,26 +142,6 @@ struct global_cell_cmp_std_tuple { template struct query_buffers {}; -/** - * Constrains types which can be used as the physical type of a dimension. - */ -template -concept DimensionType = - std::is_same_v or requires(const D& coord) { - typename std::is_signed; - { coord < coord } -> std::same_as; - { D(int64_t(coord)) } -> std::same_as; - }; - -/** - * Constrains types which can be used as the physical type of an attribute. 
- * - * Right now this doesn't constrain anything, it is just a marker for - * readability, and someday we might want it do require something. - */ -template -concept AttributeType = requires(T) { typename query_buffers::cell_type; }; - /** * Constrains types which can be used as columnar data fragment input. * @@ -165,139 +164,7 @@ concept FragmentType = requires(const T& fragment) { }; /** - * A generic, statically-typed range which is inclusive on both ends. - */ -template -struct Domain { - D lower_bound; - D upper_bound; - - Domain() { - } - - Domain(D d1, D d2) - : lower_bound(std::min(d1, d2)) - , upper_bound(std::max(d1, d2)) { - } - - uint64_t num_cells() const { - // FIXME: this is incorrect for 64-bit domains which need to check overflow - if (std::is_signed::value) { - return static_cast(upper_bound) - - static_cast(lower_bound) + 1; - } else { - return static_cast(upper_bound) - - static_cast(lower_bound) + 1; - } - } - - bool contains(D point) const { - return lower_bound <= point && point <= upper_bound; - } - - bool intersects(const Domain& other) const { - return (other.lower_bound <= lower_bound && - lower_bound <= other.upper_bound) || - (other.lower_bound <= upper_bound && - upper_bound <= other.upper_bound) || - (lower_bound <= other.lower_bound && - other.lower_bound <= upper_bound) || - (lower_bound <= other.upper_bound && - other.upper_bound <= upper_bound); - } - - tiledb::type::Range range() const { - return tiledb::type::Range(lower_bound, upper_bound); - } -}; - -/** - * A description of a dimension as it pertains to its datatype. 
- */ -template -struct Dimension { - using value_type = tiledb::type::datatype_traits::value_type; - - Dimension() = default; - Dimension(Domain domain, value_type extent) - : domain(domain) - , extent(extent) { - } - - Domain domain; - value_type extent; -}; - -template <> -struct Dimension { - using value_type = StringDimensionCoordType; - - Dimension() { - } - - Dimension(const Domain& domain) - : domain(domain) { - } - - std::optional> domain; -}; - -template -struct static_attribute {}; - -template -struct static_attribute { - static constexpr Datatype datatype = DATATYPE; - static constexpr uint32_t cell_val_num = 1; - static constexpr bool nullable = false; - - using value_type = - typename tiledb::type::datatype_traits::value_type; - using cell_type = value_type; -}; - -template -struct static_attribute { - static constexpr Datatype datatype = DATATYPE; - static constexpr uint32_t cell_val_num = 1; - static constexpr bool nullable = true; - - using value_type = std::optional< - typename tiledb::type::datatype_traits::value_type>; - using cell_type = value_type; -}; - -template -struct static_attribute { - static constexpr Datatype datatype = DATATYPE; - static constexpr uint32_t cell_val_num = tiledb::sm::cell_val_num_var; - static constexpr bool nullable = false; - - using value_type = - typename tiledb::type::datatype_traits::value_type; - using cell_type = std::vector; -}; - -template -struct static_attribute { - static constexpr Datatype datatype = DATATYPE; - static constexpr uint32_t cell_val_num = tiledb::sm::cell_val_num_var; - static constexpr bool nullable = true; - - using value_type = - typename tiledb::type::datatype_traits::value_type; - using cell_type = std::optional>; -}; - -template -constexpr std::tuple attribute_properties() { - return { - static_attribute::datatype, - static_attribute::cell_val_num, - static_attribute::nullable}; -} - -/** +2D) * Schema of named fields for simple evaluation of a query condition */ template @@ -413,7 
+280,7 @@ struct QueryConditionEvalSchema { */ bool test( const Fragment& fragment, - int record, + uint64_t record, const tiledb::sm::ASTNode& condition) const { using DimensionTuple = stdx::decay_tuple; using AttributeTuple = stdx::decay_tuple; @@ -526,8 +393,13 @@ struct query_buffers { return *this; } + query_field_size_type make_field_size( + uint64_t offset, uint64_t cell_limit) const { + return sizeof(T) * std::min(cell_limit, values_.size() - offset); + } + query_field_size_type make_field_size(uint64_t cell_limit) const { - return sizeof(T) * std::min(cell_limit, values_.size()); + return make_field_size(0, cell_limit); } int32_t attach_to_query( @@ -559,11 +431,12 @@ struct query_buffers { } void accumulate_cursor( - query_field_size_type& cursor, const query_field_size_type& field_sizes) { + query_field_size_type& cursor, + const query_field_size_type& field_sizes) const { cursor += field_sizes; } - void finish_multipart_read(const query_field_size_type& cursor) { + void resize_to_cursor(const query_field_size_type& cursor) { resize(cursor / sizeof(T)); } @@ -575,6 +448,12 @@ struct query_buffers { values_.insert(std::forward(args)...); } + self_type slice(uint64_t cell_start, uint64_t num_cells) const { + return self_type(std::vector( + values_.begin() + cell_start, + values_.begin() + cell_start + num_cells)); + } + auto begin() { return values_.begin(); } @@ -759,20 +638,35 @@ struct query_buffers> { validity_.end(), from.validity_.begin(), from.validity_.end()); } + self_type slice(uint64_t cell_start, uint64_t num_cells) const { + self_type ret; /* NB: both iterators of each range below are offset from `cell_start`, so exactly the cells `[cell_start, cell_start + num_cells)` are copied */ + ret.values_ = std::vector( + values_.begin() + cell_start, values_.begin() + cell_start + num_cells); + ret.validity_ = std::vector( + validity_.begin() + cell_start, validity_.begin() + cell_start + num_cells); + return ret; + } + self_type& operator=(const self_type& other) { values_ = other.values_; validity_ = other.validity_; return *this; } - query_field_size_type make_field_size(uint64_t cell_limit) const { + 
query_field_size_type make_field_size( + uint64_t offset, uint64_t cell_limit) const { const uint64_t values_size = - sizeof(T) * std::min(cell_limit, values_.size()); + sizeof(T) * std::min(cell_limit, values_.size() - offset); const uint64_t validity_size = - sizeof(uint8_t) * std::min(cell_limit, validity_.size()); + sizeof(uint8_t) * + std::min(cell_limit, validity_.size() - offset); return std::make_pair(values_size, validity_size); } + query_field_size_type make_field_size(uint64_t cell_limit) const { + return make_field_size(0, cell_limit); + } + int32_t attach_to_query( tiledb_ctx_t* ctx, tiledb_query_t* query, @@ -814,12 +708,13 @@ struct query_buffers> { } void accumulate_cursor( - query_field_size_type& cursor, const query_field_size_type& field_sizes) { + query_field_size_type& cursor, + const query_field_size_type& field_sizes) const { std::get<0>(cursor) += std::get<0>(field_sizes); std::get<1>(cursor) += std::get<1>(field_sizes); } - void finish_multipart_read(const query_field_size_type& cursor) { + void resize_to_cursor(const query_field_size_type& cursor) { resize(std::get<0>(cursor) / sizeof(T)); } }; @@ -912,19 +807,61 @@ struct query_buffers> { values_.insert(values_.end(), from.values_.begin(), from.values_.end()); } + self_type slice(uint64_t cell_start, uint64_t num_cells) const { + std::vector slice_offsets( + offsets_.begin() + cell_start, + offsets_.begin() + cell_start + num_cells); + std::vector slice_values; + for (uint64_t o = cell_start; o < cell_start + num_cells; o++) { + const uint64_t end = + (o + 1 == offsets_.size() ? 
values_.size() : offsets_[o + 1]); + slice_values.insert( + slice_values.end(), + values_.begin() + offsets_[o], + values_.begin() + end); + } + /* an empty slice has no first offset to rebase against */ + const uint64_t offset_adjustment = (slice_offsets.empty() ? 0 : slice_offsets[0]); + for (uint64_t& offset : slice_offsets) { + offset -= offset_adjustment; + } + + self_type ret; + ret.offsets_ = slice_offsets; + ret.values_ = slice_values; + return ret; + } + self_type& operator=(const self_type& other) { offsets_ = other.offsets_; values_ = other.values_; return *this; } - query_field_size_type make_field_size(uint64_t cell_limit) const { - const uint64_t values_size = sizeof(T) * values_.size(); + query_field_size_type make_field_size( + uint64_t cell_offset, uint64_t cell_limit) const { + const uint64_t num_cells = + std::min(cell_limit, offsets_.size() - cell_offset); + const uint64_t offsets_size = - sizeof(uint64_t) * std::min(cell_limit, offsets_.size()); + sizeof(uint64_t) * + std::min(num_cells, offsets_.size() - cell_offset); + /* the values of the range end where the first cell after the range begins, or at the end of `values_` when the range includes the last cell */ + uint64_t values_size; + if (cell_offset + num_cells < offsets_.size()) { + values_size = sizeof(T) * + (offsets_[cell_offset + num_cells] - offsets_[cell_offset]); + } else { + values_size = sizeof(T) * (values_.size() - offsets_[cell_offset]); + } + return std::make_pair(values_size, offsets_size); } + query_field_size_type make_field_size(uint64_t cell_limit) const { + return make_field_size(0, cell_limit); + } + int32_t attach_to_query( tiledb_ctx_t* ctx, tiledb_query_t* query, @@ -974,12 +911,13 @@ struct query_buffers> { } void accumulate_cursor( - query_field_size_type& cursor, const query_field_size_type& field_sizes) { + query_field_size_type& cursor, + const query_field_size_type& field_sizes) const { std::get<0>(cursor) += std::get<0>(field_sizes); std::get<1>(cursor) += std::get<1>(field_sizes); } - void finish_multipart_read(const query_field_size_type& cursor) { + void resize_to_cursor(const query_field_size_type& cursor) { values_.resize(std::get<0>(cursor) / sizeof(T)); 
offsets_.resize(std::get<1>(cursor) / sizeof(uint64_t)); } @@ -1086,15 +1024,26 @@ struct query_buffers>> { return *this; } - query_field_size_type make_field_size(uint64_t cell_limit) const { - const uint64_t values_size = sizeof(T) * values_.size(); + query_field_size_type make_field_size( + uint64_t cell_offset, uint64_t cell_limit) const { const uint64_t offsets_size = - sizeof(uint64_t) * std::min(cell_limit, offsets_.size()); + sizeof(uint64_t) * + std::min(cell_limit, offsets_.size() - cell_offset); const uint64_t validity_size = - sizeof(uint8_t) * std::min(cell_limit, validity_.size()); + sizeof(uint8_t) * + std::min(cell_limit, validity_.size() - cell_offset); + + // NB: unlike the above this can just be the whole buffer + // since offsets is what determines the values + const uint64_t values_size = sizeof(T) * values_.size(); + return std::make_tuple(values_size, offsets_size, validity_size); } + query_field_size_type make_field_size(uint64_t cell_limit) const { + return make_field_size(0, cell_limit); + } + int32_t attach_to_query( tiledb_ctx_t* ctx, tiledb_query_t* query, @@ -1157,13 +1106,14 @@ struct query_buffers>> { } void accumulate_cursor( - query_field_size_type& cursor, const query_field_size_type& field_sizes) { + query_field_size_type& cursor, + const query_field_size_type& field_sizes) const { std::get<0>(cursor) += std::get<0>(field_sizes); std::get<1>(cursor) += std::get<1>(field_sizes); std::get<2>(cursor) += std::get<2>(field_sizes); } - void finish_multipart_read(const query_field_size_type& cursor) { + void resize_to_cursor(const query_field_size_type& cursor) { values_.resize(std::get<0>(cursor) / sizeof(T)); offsets_.resize(std::get<1>(cursor) / sizeof(uint64_t)); validity_.resize(std::get<2>(cursor) / sizeof(uint8_t)); @@ -1288,6 +1238,30 @@ struct Fragment { }, std::tuple_cat(dimensions(), attributes())); } + + /** + * @return a new fragment containing the cells in the range `[cell_start, + * cell_start + num_cells)` + */ + 
self_type slice(uint64_t cell_start, uint64_t num_cells) const { + const auto dims = std::apply( + [&](Ts&... dst) { + return std::make_tuple(dst.slice(cell_start, num_cells)...); + }, + dimensions()); + const auto atts = std::apply( + [&](Ts&... dst) { + return std::make_tuple(dst.slice(cell_start, num_cells)...); + }, + attributes()); + + return self_type{.dims_ = dims, .atts_ = atts}; + } + + bool operator==(const self_type& other) const { + return dimensions() == other.dimensions() && + attributes() == other.attributes(); + } }; /** @@ -1368,7 +1342,7 @@ struct query_applicator { * @return a tuple containing the size of each input field */ static auto make_field_sizes( - const std::tuple&...> fields, + const std::tuple&...> fields, uint64_t cell_limit = std::numeric_limits::max()) { std::optional num_cells; auto make_field_size = [&](const query_buffers& field) { @@ -1391,6 +1365,27 @@ struct query_applicator { fields); } + /** + * @return a tuple containing the size of each input field to write for the + * half-open range of input cells `[cell_offset, cell_offset + cell_limit)` + */ + static auto write_make_field_sizes( + const std::tuple&...> fields, + uint64_t cell_offset, + uint64_t cell_limit = std::numeric_limits::max()) { + auto write_make_field_size = [&]( + const query_buffers& field) { + const auto field_size = field.make_field_size(cell_offset, cell_limit); + return field_size; + }; + + return std::apply( + [&](const auto&... 
field) { + return std::make_tuple(write_make_field_size(field)...); + }, + fields); + } + /** * Sets buffers on `query` for the variadic `fields` and `fields_sizes` */ @@ -1407,7 +1402,11 @@ struct query_applicator { const auto& field_cursor) { const auto rc = field.attach_to_query(ctx, query, field_size, name, field_cursor); - ASSERTER(std::optional() == error_if_any(ctx, rc)); + + // some versions of gcc have a false positive here for + // -Wmaybe-uninitialized, so do this instead of comparing against + // `std::optional` + ASSERTER("" == error_if_any(ctx, rc).value_or("")); }; unsigned d = 0; @@ -1471,9 +1470,10 @@ namespace query { */ template auto make_field_sizes( - F& fragment, uint64_t cell_limit = std::numeric_limits::max()) { - typename F::DimensionBuffersRef dims = fragment.dimensions(); - typename F::AttributeBuffersRef atts = fragment.attributes(); + const F& fragment, + uint64_t cell_limit = std::numeric_limits::max()) { + typename F::DimensionBuffersConstRef dims = fragment.dimensions(); + typename F::AttributeBuffersConstRef atts = fragment.attributes(); return [cell_limit](std::tuple fields) { return query_applicator::make_field_sizes( fields, cell_limit); @@ -1483,7 +1483,20 @@ auto make_field_sizes( template using fragment_field_sizes_t = decltype(make_field_sizes( - std::declval(), std::declval())); + std::declval(), std::declval())); + +template +fragment_field_sizes_t write_make_field_sizes( + const F& fragment, + uint64_t cell_offset, + uint64_t cell_limit = std::numeric_limits::max()) { + typename F::DimensionBuffersConstRef dims = fragment.dimensions(); + typename F::AttributeBuffersConstRef atts = fragment.attributes(); + return [cell_offset, cell_limit](std::tuple fields) { + return query_applicator...>:: + write_make_field_sizes(fields, cell_offset, cell_limit); + }(std::tuple_cat(dims, atts)); +} /** * Apply field cursor and sizes to each field of `fragment`. 
@@ -1510,6 +1523,46 @@ void apply_cursor( std::tuple_cat(dims, atts)); } +/** + * Advances field cursors `cursor` over `fragment` by the amount of data from + * `field_sizes` + */ +template +void accumulate_cursor( + const F& fragment, + fragment_field_sizes_t& cursor, + const fragment_field_sizes_t& field_sizes) { + std::apply( + [&](auto&... field) { + std::apply( + [&](auto&... field_cursor) { + std::apply( + [&](const auto&... field_size) { + (field.accumulate_cursor(field_cursor, field_size), ...); + }, + field_sizes); + }, + cursor); + }, + std::tuple_cat(fragment.dimensions(), fragment.attributes())); +} + +/** + * Resizes each field of `fragment` to the size recorded for it in `cursor`. + */ +template +void resize(F& fragment, const fragment_field_sizes_t& cursor) { + std::apply( + [&cursor](auto&... field) { /* invoked immediately: borrow, don't copy */ + std::apply( + [&](const auto&... field_cursor) { + (field.resize_to_cursor(field_cursor), ...); + }, + cursor); + }, + std::tuple_cat(fragment.dimensions(), fragment.attributes())); +} + /** * Set buffers on `query` for the tuple of field columns */ @@ -1569,7 +1622,7 @@ uint64_t num_cells(const F& fragment, const auto& field_sizes) { } /** - * Writes a fragment to an array. + * Writes a fragment to a sparse array. */ template void write_fragment( @@ -1604,10 +1657,105 @@ void write_fragment( ASSERTER(num_cells == expect_num_cells); } +/** + * Writes a fragment to a dense array. 
+ */ +template +void write_fragment( + const Fragment& fragment, + Array& forwrite, + const sm::NDRange& subarray, + tiledb_layout_t layout = TILEDB_ROW_MAJOR) { + Query query(forwrite.context(), forwrite, TILEDB_WRITE); + query.set_layout(layout); + + std::vector coords; + for (const auto& dim : subarray) { + coords.push_back(dim.start_as()); + coords.push_back(dim.end_as()); + } + + Subarray sub(query.ctx(), forwrite); + sub.set_subarray(coords); + query.set_subarray(sub); + + auto field_sizes = + make_field_sizes(const_cast(fragment)); + templates::query::set_fields( + query.ctx().ptr().get(), + query.ptr().get(), + field_sizes, + const_cast(fragment), + [](unsigned d) { return "d" + std::to_string(d + 1); }, + [](unsigned a) { return "a" + std::to_string(a + 1); }); + + const auto status = query.submit(); + ASSERTER(status == Query::Status::COMPLETE); + + if (layout == TILEDB_GLOBAL_ORDER) { + query.finalize(); + } + + // check that sizes match what we expect + const uint64_t expect_num_cells = fragment.size(); + const uint64_t num_cells = + templates::query::num_cells(fragment, field_sizes); + + ASSERTER(num_cells == expect_num_cells); +} + } // namespace query namespace ddl { +template +struct cell_type_traits; + +template <> +struct cell_type_traits { + static constexpr sm::Datatype physical_type = sm::Datatype::CHAR; + static constexpr uint32_t cell_val_num = 1; + static constexpr bool is_nullable = false; +}; + +template <> +struct cell_type_traits { + static constexpr sm::Datatype physical_type = sm::Datatype::INT32; + static constexpr uint32_t cell_val_num = 1; + static constexpr bool is_nullable = false; +}; + +template <> +struct cell_type_traits { + static constexpr sm::Datatype physical_type = sm::Datatype::UINT64; + static constexpr uint32_t cell_val_num = 1; + static constexpr bool is_nullable = false; +}; + +template +struct cell_type_traits> { + static constexpr sm::Datatype physical_type = + cell_type_traits::physical_type; + static constexpr 
uint32_t cell_val_num = std::numeric_limits::max(); + static constexpr bool is_nullable = false; +}; + +template +std::vector> physical_type_attributes() { + std::vector> ret; + auto attr = [&](const T&) { + ret.push_back(std::make_tuple( + cell_type_traits>::physical_type, + cell_type_traits>::cell_val_num, + cell_type_traits>::is_nullable)); + }; + std::apply( + [&](const auto&... value) { (attr(value), ...); }, + typename F::AttributeTuple()); + + return ret; +} + /** * Creates an array with a schema whose dimensions and attributes * come from the simplified arguments. diff --git a/test/support/src/fragment_info_helpers.h b/test/support/src/fragment_info_helpers.h new file mode 100644 index 00000000000..f7899161887 --- /dev/null +++ b/test/support/src/fragment_info_helpers.h @@ -0,0 +1,166 @@ +/** + * @file fragment_info_helpers.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file provides declarations and definitions of functionality which + * may be common to tests inspecting fragment info and fragment metadata. + */ + +#ifndef TILEDB_TEST_FRAGMENT_INFO_HELPERS_H +#define TILEDB_TEST_FRAGMENT_INFO_HELPERS_H + +#include +#include + +#include "tiledb/api/c_api/fragment_info/fragment_info_api_internal.h" +#include "tiledb/sm/cpp_api/context.h" +#include "tiledb/sm/cpp_api/fragment_info.h" +#include "tiledb/sm/enums/layout.h" +#include "tiledb/sm/fragment/single_fragment_info.h" +#include "tiledb/sm/misc/types.h" +#include "tiledb/sm/tile/test/arithmetic.h" + +#include +#include + +namespace tiledb::test { + +template +std::vector>> +collect_and_validate_fragment_domains( + const Context& ctx, + sm::Layout tile_order, + const std::string& array_name, + const std::span tile_extents, + const sm::NDRange& expect_domain, + uint64_t max_fragment_size) { + const uint64_t num_dimensions = expect_domain.size(); + + FragmentInfo finfo(ctx, array_name); + finfo.load(); + + // collect fragment domains + std::vector>> fragment_domains; + for (uint32_t f = 0; f < finfo.fragment_num(); f++) { + std::vector> this_fragment_domain; + for (uint64_t d = 0; d < num_dimensions; d++) { + D bounds[2]; + finfo.get_non_empty_domain(f, d, &bounds[0]); + this_fragment_domain.push_back( + templates::Domain(bounds[0], bounds[1])); + } + fragment_domains.push_back(this_fragment_domain); + } + + // the fragments are not always emitted in the same order, sort them + auto domain_cmp = [&](const auto& left, const auto& right) { + for (uint64_t di = 0; di < num_dimensions; di++) { + const uint64_t d = + (tile_order == sm::Layout::ROW_MAJOR ? 
di : num_dimensions - di - 1); + if (left[d].lower_bound < right[d].lower_bound) { + return true; + } else if (left[d].lower_bound > right[d].lower_bound) { + return false; + } else if (left[d].upper_bound < right[d].upper_bound) { + return true; + } else if (left[d].upper_bound > right[d].upper_bound) { + return false; + } + } + return false; + }; + std::vector fragments_in_order(finfo.fragment_num()); + std::iota(fragments_in_order.begin(), fragments_in_order.end(), 0); + std::sort( + fragments_in_order.begin(), + fragments_in_order.end(), + [&](const uint32_t f_left, const uint32_t f_right) -> bool { + const auto& left = fragment_domains[f_left]; + const auto& right = fragment_domains[f_right]; + return domain_cmp(left, right); + }); + std::sort(fragment_domains.begin(), fragment_domains.end(), domain_cmp); + + // validate fragment domains + ASSERTER(!fragment_domains.empty()); + + // fragment domains should be contiguous in global order and cover the whole + // subarray + uint64_t subarray_tile_offset = 0; + for (uint32_t f = 0; f < fragments_in_order.size(); f++) { + const sm::NDRange& internal_domain = + finfo.ptr() + ->fragment_info() + ->single_fragment_info_vec()[fragments_in_order[f]] + .non_empty_domain(); + + const uint64_t f_num_tiles = + compute_num_tiles(tile_extents, internal_domain); + const std::optional f_start_tile = compute_start_tile( + tile_order, tile_extents, expect_domain, internal_domain); + + ASSERTER(f_start_tile == subarray_tile_offset); + subarray_tile_offset += f_num_tiles; + } + ASSERTER( + subarray_tile_offset == + compute_num_tiles(tile_extents, expect_domain)); + + auto meta_size = [&](uint32_t f) -> uint64_t { + return finfo.ptr() + ->fragment_info() + ->single_fragment_info_vec()[f] + .meta() + ->fragment_meta_size(); + }; + + // validate fragment size - no fragment should be larger than max requested + // size + for (uint32_t f : fragments_in_order) { + const uint64_t fsize = finfo.fragment_size(f); + const uint64_t fmetasize 
= meta_size(f); + ASSERTER(fsize <= max_fragment_size + fmetasize); + } + + // validate fragment size - we wrote the largest possible fragments (no two + // adjacent should be under max fragment size) + for (uint32_t fi = 1; fi < fragments_in_order.size(); fi++) { + const uint32_t fprev = fragments_in_order[fi - 1]; + const uint32_t fcur = fragments_in_order[fi]; + const uint64_t combined_size = + finfo.fragment_size(fprev) + finfo.fragment_size(fcur); + const uint64_t combined_meta_size = meta_size(fprev) + meta_size(fcur); + ASSERTER(combined_size > max_fragment_size + combined_meta_size); + } + + return fragment_domains; +} + +} // namespace tiledb::test + +#endif diff --git a/tiledb/common/arithmetic.h b/tiledb/common/arithmetic.h index 08ced92e703..3aa85a6ae58 100644 --- a/tiledb/common/arithmetic.h +++ b/tiledb/common/arithmetic.h @@ -196,6 +196,20 @@ struct checked_arithmetic { return -negated.value(); } } + + /** + * @return `a * b` if it can be represented as a `uint64_t` without undefined + * behavior, `std::nullopt` otherwise + */ + static std::optional mul(uint64_t a, uint64_t b) { + if (b == 0) { + return 0; + } else if (a > std::numeric_limits::max() / b) { + return std::nullopt; + } else { + return a * b; + } + } }; template <> diff --git a/tiledb/sm/fragment/fragment_metadata.cc b/tiledb/sm/fragment/fragment_metadata.cc index ba6fc1b2e9c..1640c0e4d62 100644 --- a/tiledb/sm/fragment/fragment_metadata.cc +++ b/tiledb/sm/fragment/fragment_metadata.cc @@ -673,14 +673,7 @@ uint64_t FragmentMetadata::fragment_size() const { for (const auto& file_validity_size : file_validity_sizes_) size += file_validity_size; - // The fragment metadata file size can be empty when we've loaded consolidated - // metadata - uint64_t meta_file_size = meta_file_size_; - if (meta_file_size == 0) { - auto meta_uri = fragment_uri_.join_path( - std::string(constants::fragment_metadata_filename)); - meta_file_size = resources_->vfs().file_size(meta_uri); - } + const uint64_t 
meta_file_size = fragment_meta_size(); // Validate that the meta_file_size is not zero, either preloaded or fetched // above iassert(meta_file_size != 0); @@ -691,14 +684,29 @@ uint64_t FragmentMetadata::fragment_size() const { return size; } -void FragmentMetadata::init_domain(const NDRange& non_empty_domain) { - auto& domain{array_schema_->domain()}; +uint64_t FragmentMetadata::fragment_meta_size() const { + // The fragment metadata file size can be empty when we've loaded consolidated + // metadata + if (meta_file_size_ == 0) { + auto meta_uri = fragment_uri_.join_path( + std::string(constants::fragment_metadata_filename)); + meta_file_size_ = resources_->vfs().file_size(meta_uri); + } + return meta_file_size_; +} +void FragmentMetadata::init_domain(const NDRange& non_empty_domain) { // Sanity check iassert(!non_empty_domain.empty()); iassert(non_empty_domain_.empty()); iassert(domain_.empty()); + set_domain(non_empty_domain); +} + +void FragmentMetadata::set_domain(const NDRange& non_empty_domain) { + auto& domain{array_schema_->domain()}; + // Set non-empty domain for dense arrays (for sparse it will be calculated // via the MBRs) if (dense_) { @@ -841,6 +849,32 @@ void FragmentMetadata::load( } void FragmentMetadata::store(const EncryptionKey& encryption_key) { + // integrity checks + if (dense_) { + const uint64_t dense_tile_num = tile_num(); + + for (const auto& tile_offsets : loaded_metadata_ptr_->tile_offsets()) { + iassert(tile_offsets.size() == dense_tile_num); + } + for (const auto& tile_var_offsets : + loaded_metadata_ptr_->tile_var_offsets()) { + iassert(tile_var_offsets.size() == dense_tile_num); + } + for (const auto& tile_var_sizes : loaded_metadata_ptr_->tile_var_sizes()) { + iassert(tile_var_sizes.size() == dense_tile_num); + } + for (const auto& tile_validity_offsets : + loaded_metadata_ptr_->tile_validity_offsets()) { + iassert(tile_validity_offsets.size() == dense_tile_num); + } + for (const auto& tile_null_counts : + 
loaded_metadata_ptr_->tile_null_counts()) { + if (!tile_null_counts.empty()) { + iassert(tile_null_counts.size() == dense_tile_num); + } + } + } + auto timer_se = resources_->stats().start_timer("write_store_frag_meta"); // Make sure the data fits in the current domain before we commit to disk. @@ -1194,6 +1228,11 @@ void FragmentMetadata::store_v15_or_higher( } void FragmentMetadata::set_num_tiles(uint64_t num_tiles) { + if (dense_) { + const uint64_t dense_tile_num = tile_num(); + iassert(num_tiles <= dense_tile_num); + } + for (auto& it : idx_map_) { auto i = it.second; iassert(num_tiles >= loaded_metadata_ptr_->tile_offsets()[i].size()); diff --git a/tiledb/sm/fragment/fragment_metadata.h b/tiledb/sm/fragment/fragment_metadata.h index 931d6d0099f..fe2e6e69a35 100644 --- a/tiledb/sm/fragment/fragment_metadata.h +++ b/tiledb/sm/fragment/fragment_metadata.h @@ -253,6 +253,9 @@ class FragmentMetadata { /** Retrieves the fragment size. */ uint64_t fragment_size() const; + /** @return the size of the metadata file */ + uint64_t fragment_meta_size() const; + /** * Returns true if the corresponding fragment is dense, and false if it * is sparse. @@ -353,6 +356,12 @@ class FragmentMetadata { */ void init_domain(const NDRange& non_empty_domain); + /** + * Updates the fragment's internal domain and non-empty domain members. + * Validity of the argument is not checked so use with caution. + */ + void set_domain(const NDRange& non_empty_domain); + /** * Loads the basic metadata from storage or `f_buff` for later * versions if it is not `nullptr`. @@ -898,7 +907,7 @@ class FragmentMetadata { uint64_t sparse_tile_num_; /** The size of the fragment metadata file. */ - uint64_t meta_file_size_; + mutable uint64_t meta_file_size_; /** Local mutex for thread-safety. 
*/ std::mutex mtx_; diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 0fbd84db127..751511714ab 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -83,6 +83,13 @@ static uint64_t get_effective_memory_budget( /* CONSTRUCTORS & DESTRUCTORS */ /* ****************************** */ +Query::CoordsInfo::CoordsInfo() + : has_coords_(false) + , coords_buffer_(nullptr) + , coords_buffer_size_(nullptr) + , coords_num_(0) { +} + Query::Query( ContextResources& resources, CancellationSource cancellation_source, @@ -125,7 +132,6 @@ Query::Query( , remote_query_(false) , is_dimension_label_ordered_read_(false) , dimension_label_increasing_(true) - , fragment_size_(std::numeric_limits::max()) , memory_budget_(memory_budget) , query_remote_buffer_storage_(std::nullopt) , default_channel_{make_shared(HERE(), *this, 0)} { @@ -141,11 +147,6 @@ Query::Query( fragment_metadata_ = array->fragment_metadata(); - coords_info_.coords_buffer_ = nullptr; - coords_info_.coords_buffer_size_ = nullptr; - coords_info_.coords_num_ = 0; - coords_info_.has_coords_ = false; - callback_ = nullptr; callback_data_ = nullptr; status_ = QueryStatus::UNINITIALIZED; @@ -1636,10 +1637,19 @@ Status Query::submit() { } // Make sure fragment size is only set for global order. - if (fragment_size_ != std::numeric_limits::max() && - (layout_ != Layout::GLOBAL_ORDER || type_ != QueryType::WRITE)) { - throw QueryException( - "[submit] Fragment size is only supported for global order writes."); + if (fragment_size_.has_value()) { + if (layout_ != Layout::GLOBAL_ORDER || type_ != QueryType::WRITE) { + throw QueryException( + "[submit] Fragment size is only supported for global order writes."); + } else if (array_schema_->dense() && array_->is_remote()) { + // For dense arrays, `max_fragment_size_` requires buffering of a trail of + // filtered tiles which may not fit in a target fragment. This trail of + // tiles is not serializable. 
As such `max_fragment_size` cannot be used + // with remote dense array writes. + throw QueryException( + "[submit] Fragment size is not supported for remote global order " + "writes to dense arrays."); + } } // Check attribute/dimensions buffers completeness before query submits diff --git a/tiledb/sm/query/query.h b/tiledb/sm/query/query.h index 5d39ed133cf..c33d32bd559 100644 --- a/tiledb/sm/query/query.h +++ b/tiledb/sm/query/query.h @@ -127,6 +127,8 @@ class Query { /** Keeps track of the number of coordinates across coordinate buffers. */ uint64_t coords_num_; + + CoordsInfo(); }; /* ********************************* */ @@ -1108,7 +1110,7 @@ class Query { * * Note: This is only used for global order writes. */ - uint64_t fragment_size_; + std::optional fragment_size_; /** * Memory budget. If set to nullopt, the value will be obtained from the diff --git a/tiledb/sm/query/writers/global_order_writer.cc b/tiledb/sm/query/writers/global_order_writer.cc index 38b3a3523ba..85abf228145 100644 --- a/tiledb/sm/query/writers/global_order_writer.cc +++ b/tiledb/sm/query/writers/global_order_writer.cc @@ -45,13 +45,16 @@ #include "tiledb/sm/misc/parallel_functions.h" #include "tiledb/sm/misc/tdb_math.h" #include "tiledb/sm/misc/tdb_time.h" +#include "tiledb/sm/misc/types.h" #include "tiledb/sm/query/hilbert_order.h" #include "tiledb/sm/query/query_macros.h" #include "tiledb/sm/stats/global_stats.h" +#include "tiledb/sm/tile/arithmetic.h" #include "tiledb/sm/tile/generic_tile_io.h" #include "tiledb/sm/tile/tile_metadata_generator.h" #include "tiledb/sm/tile/writer_tile_tuple.h" #include "tiledb/storage_format/uri/generate_uri.h" +#include "tiledb/type/apply_with_type.h" using namespace tiledb; using namespace tiledb::common; @@ -59,6 +62,98 @@ using namespace tiledb::sm::stats; namespace tiledb::sm { +/** + * See `tiledb/sm/tile/arithmetic.h` function `is_rectangular_domain`. 
+ * + * When writing multiple dense fragments the domain of each fragment + * must accurately reflect the coordinates contained in that fragment. + * This is called in `GlobalOrderWriter::identify_fragment_tile_boundaries` for + * each of the input tiles to determine whether a rectangle is formed and + * including a tile in a fragment is sound. + */ +static IsRectangularDomain is_rectangular_domain( + const ArraySchema& arrayschema, + const NDRange& domain, + uint64_t start_tile, + uint64_t num_tiles) { + const Domain& arraydomain = arrayschema.domain(); + + // NB: ordinary write subarray must be tile aligned but the consolidation + // subarray is not required to be + NDRange arraydomain_aligned = domain; + arraydomain.expand_to_tiles_when_no_current_domain(arraydomain_aligned); + + auto impl = [&](T) { + if constexpr (TileDBIntegral) { + std::vector tile_extents; + tile_extents.reserve(arraydomain.dim_num()); + for (uint64_t d = 0; d < arraydomain.dim_num(); d++) { + tile_extents.push_back(arraydomain.tile_extent(d).rvalue_as()); + } + + return is_rectangular_domain( + arrayschema.tile_order(), + tile_extents, + arraydomain_aligned, + start_tile, + num_tiles); + } else { + return IsRectangularDomain::Never; + } + }; + return apply_with_type(impl, arraydomain.dimension_ptr(0)->type()); +} + +/** + * See `tiledb/sm/tile/arithmetic.h` function `domain_tile_offset`. + * + * When writing multiple dense fragments the domain of each fragment + * must accurately reflect the coordinates contained in that fragment. + * This is called when starting a new fragment to update the domain of the + * previous fragment and set the correct starting domain of the new one. 
+ */ +static std::optional domain_tile_offset( + const ArraySchema& arrayschema, + const NDRange& domain, + uint64_t start_tile, + uint64_t num_tiles) { + const Domain& arraydomain = arrayschema.domain(); + + // NB: ordinary write subarray must be tile aligned but the consolidation + // subarray is not required to be. Align for purposes of tile arithmetic. + NDRange arraydomain_aligned = domain; + arraydomain.expand_to_tiles_when_no_current_domain(arraydomain_aligned); + + auto impl = [&](T) { + if constexpr (TileDBIntegral) { + std::vector tile_extents; + tile_extents.reserve(arraydomain.dim_num()); + for (uint64_t d = 0; d < arraydomain.dim_num(); d++) { + tile_extents.push_back(arraydomain.tile_extent(d).rvalue_as()); + } + + std::optional r = domain_tile_offset( + arrayschema.tile_order(), + tile_extents, + arraydomain_aligned, + start_tile, + num_tiles); + if (r.has_value()) { + // aligning to the array domain may have extended beyond the subarray, + // clamp the result back within the subarray bounds + for (uint64_t d = 0; d < arraydomain.dim_num(); d++) { + tiledb::type::crop_range(domain[d], r.value()[d]); + } + } + return r; + } else { + return std::optional{}; + } + }; + + return apply_with_type(impl, arraydomain.dimension_ptr(0)->type()); +} + class GlobalOrderWriterException : public StatusException { public: explicit GlobalOrderWriterException(const std::string& message) @@ -74,7 +169,7 @@ GlobalOrderWriter::GlobalOrderWriter( stats::Stats* stats, shared_ptr logger, StrategyParams& params, - uint64_t fragment_size, + std::optional fragment_size, std::vector& written_fragment_info, bool disable_checks_consolidation, std::vector& processed_conditions, @@ -91,7 +186,7 @@ GlobalOrderWriter::GlobalOrderWriter( remote_query, fragment_name) , processed_conditions_(processed_conditions) - , fragment_size_(fragment_size) + , max_fragment_size_(fragment_size) , current_fragment_size_(0) { // Check the layout is global order. 
if (layout_ != Layout::GLOBAL_ORDER) { @@ -116,6 +211,7 @@ GlobalOrderWriter::GlobalWriteState::GlobalWriteState( : last_tiles_(memory_tracker->get_resource(MemoryType::WRITER_TILE_DATA)) , last_var_offsets_(memory_tracker->get_resource(MemoryType::WRITER_DATA)) , cells_written_(memory_tracker->get_resource(MemoryType::WRITER_DATA)) { + dense_.domain_tile_offset_ = 0; } /* ****************************** */ @@ -202,7 +298,7 @@ Status GlobalOrderWriter::init_global_write_state() { const auto& domain{array_schema_.domain()}; const auto capacity = array_schema_.capacity(); const auto cell_num_per_tile = - coords_info_.has_coords_ ? capacity : domain.cell_num_per_tile(); + dense() ? domain.cell_num_per_tile() : capacity; auto last_tiles_it = global_write_state_->last_tiles_.emplace( std::piecewise_construct, std::forward_as_tuple(name), @@ -231,6 +327,11 @@ GlobalOrderWriter::GlobalWriteState* GlobalOrderWriter::get_global_state() { return global_write_state_.get(); } +const GlobalOrderWriter::GlobalWriteState* GlobalOrderWriter::get_global_state() + const { + return global_write_state_.get(); +} + std::pair> GlobalOrderWriter::multipart_upload_state(bool client) { if (client) { @@ -388,7 +489,7 @@ Status GlobalOrderWriter::check_global_order() const { } // Applicable only to sparse writes - exit if coordinates do not exist - if (!coords_info_.has_coords_ || coords_info_.coords_num_ == 0) { + if (dense() || coords_info_.coords_num_ == 0) { return Status::Ok(); } @@ -498,12 +599,14 @@ Status GlobalOrderWriter::check_global_order_hilbert() const { void GlobalOrderWriter::clean_up() { if (global_write_state_ != nullptr) { - const auto& uri = global_write_state_->frag_meta_->fragment_uri(); + if (global_write_state_->frag_meta_) { + const auto& uri = global_write_state_->frag_meta_->fragment_uri(); - // Cleanup the fragment we are currently writing. There is a chance that the - // URI is empty if creating the first fragment had failed. 
- if (!uri.empty()) { - resources_.vfs().remove_dir(uri); + // Cleanup the fragment we are currently writing. There is a chance that + // the URI is empty if creating the first fragment had failed. + if (!uri.empty()) { + resources_.vfs().remove_dir(uri); + } } global_write_state_.reset(nullptr); @@ -516,27 +619,37 @@ void GlobalOrderWriter::clean_up() { } Status GlobalOrderWriter::filter_last_tiles(uint64_t cell_num) { + const uint64_t last_tile_offset = + global_write_state_->last_tiles_.begin()->second.size() - 1; + // Adjust cell num for (auto& last_tiles : global_write_state_->last_tiles_) { - last_tiles.second[0].set_final_size(cell_num); + last_tiles.second.back()->set_final_size(cell_num); } // Compute coordinates metadata auto meta = global_write_state_->frag_meta_; - auto mbrs = compute_mbrs(global_write_state_->last_tiles_); + auto mbrs = compute_mbrs( + last_tile_offset, last_tile_offset + 1, global_write_state_->last_tiles_); set_coords_metadata(0, 1, global_write_state_->last_tiles_, mbrs, meta); // Compute tile metadata. 
- RETURN_NOT_OK(compute_tiles_metadata(1, global_write_state_->last_tiles_)); + RETURN_NOT_OK(compute_tiles_metadata( + last_tile_offset, + last_tile_offset + 1, + global_write_state_->last_tiles_)); // Gather stats stats_->add_counter( "cell_num", - global_write_state_->last_tiles_.begin()->second[0].cell_num()); + global_write_state_->last_tiles_.begin()->second.back()->cell_num()); stats_->add_counter("tile_num", 1); // Filter tiles - RETURN_NOT_OK(filter_tiles(&global_write_state_->last_tiles_)); + RETURN_NOT_OK(filter_tiles( + last_tile_offset, + last_tile_offset + 1, + &global_write_state_->last_tiles_)); return Status::Ok(); } @@ -625,19 +738,69 @@ Status GlobalOrderWriter::compute_coord_dups( Status GlobalOrderWriter::finalize_global_write_state() { iassert(layout_ == Layout::GLOBAL_ORDER, "layout = {}", layout_str(layout_)); - auto meta = global_write_state_->frag_meta_; - const auto& uri = meta->fragment_uri(); + + // For dense, there may be prepared tiles which have not been flushed yet + if (dense()) { + const uint64_t num_remaining = + global_write_state_->last_tiles_.begin()->second.size() - 1; + if (num_remaining > 0) { + iassert(global_write_state_->frag_meta_); + throw_if_not_ok(populate_fragment( + global_write_state_->last_tiles_, 0, num_remaining)); + + // FIXME: there is a possibility here that we write a tile bigger than the + // max fragment size if these remaining tiles fill it up and then the last + // tile runs over... in this case we need to do the rectangle thing all + // over again so as to avoid writing a fragment which exceeds the max + // fragment size. + // + // HOWEVER, this state might not be reachable, because dense global + // order writes must be fully tile-aligned, which means that the + // "last tile" which we would flush here should have zero cells. + // Note that the subarray is a rectangle, so + // `identify_fragment_tile_boundaries` should always indicate that all of + // the tiles can be written. 
+ // + // As such we are not going to expend more effort on this unless + // we see evidence of it. + } + } else { + iassert(global_write_state_->last_tiles_.begin()->second.size() <= 1); + } // Handle last tile Status st = global_write_handle_last_tile(); + auto meta = global_write_state_->frag_meta_; + if (!st.ok()) { - throw_if_not_ok(close_files(meta)); + if (meta) { + throw_if_not_ok(close_files(meta)); + } return st; } + if (!meta) { + return Status::Ok(); + } + + const auto& uri = meta->fragment_uri(); + // Close all files RETURN_NOT_OK(close_files(meta)); + // Update dense fragment domain + if (dense()) { + const uint64_t num_tiles_in_fragment = + meta->loaded_metadata()->tile_offsets()[0].size(); + std::optional fragment_domain = domain_tile_offset( + array_schema_, + subarray_.ndrange(0), + global_write_state_->dense_.domain_tile_offset_, + num_tiles_in_fragment); + iassert(fragment_domain.has_value()); + meta->set_domain(std::move(fragment_domain.value())); + } + // Check that the same number of cells was written across attributes // and dimensions auto cell_num = global_write_state_->cells_written_[buffers_.begin()->first]; @@ -656,7 +819,7 @@ Status GlobalOrderWriter::finalize_global_write_state() { } // Check if the total number of cells written is equal to the subarray size - if (!coords_info_.has_coords_) { // This implies a dense array + if (dense()) { auto& domain{array_schema_.domain()}; auto expected_cell_num = domain.cell_num(subarray_.ndrange(0)); @@ -720,6 +883,21 @@ Status GlobalOrderWriter::finalize_global_write_state() { return st; } +Status GlobalOrderWriter::populate_fragment( + tdb::pmr::unordered_map& tiles, + uint64_t tile_offset, + uint64_t num_tiles) { + auto frag_meta = global_write_state_->frag_meta_; + + // write tiles for all attributes + RETURN_CANCEL_OR_ERROR( + write_tiles(tile_offset, tile_offset + num_tiles, frag_meta, &tiles)); + + frag_meta->set_tile_index_base(frag_meta->tile_index_base() + num_tiles); + + return 
Status::Ok(); +} + Status GlobalOrderWriter::global_write() { // Applicable only to global write on dense/sparse arrays iassert(layout_ == Layout::GLOBAL_ORDER, "layout = {}", layout_str(layout_)); @@ -727,8 +905,7 @@ Status GlobalOrderWriter::global_write() { // Initialize the global write state if this is the first invocation if (!global_write_state_) { RETURN_CANCEL_OR_ERROR(alloc_global_write_state()); - RETURN_CANCEL_OR_ERROR(create_fragment( - !coords_info_.has_coords_, global_write_state_->frag_meta_)); + RETURN_NOT_OK(create_fragment(dense(), global_write_state_->frag_meta_)); RETURN_CANCEL_OR_ERROR(init_global_write_state()); } @@ -748,71 +925,93 @@ Status GlobalOrderWriter::global_write() { query_memory_tracker_->get_resource(MemoryType::WRITER_TILE_DATA)); RETURN_CANCEL_OR_ERROR(prepare_full_tiles(coord_dups, &tiles)); - // Find number of tiles and gather stats - uint64_t tile_num = 0; - if (!tiles.empty()) { - auto it = tiles.begin(); - tile_num = it->second.size(); - - uint64_t cell_num = 0; - for (size_t t = 0; t < tile_num; ++t) { - cell_num += it->second[t].cell_num(); - } - stats_->add_counter("cell_num", cell_num); - stats_->add_counter("tile_num", tile_num); - } - - // No cells to be written + uint64_t tile_num = (tiles.empty() ? 0 : tiles.begin()->second.size()); if (tile_num == 0) { return Status::Ok(); } + // Compute tile metadata. + RETURN_CANCEL_OR_ERROR(compute_tiles_metadata(tiles)); + // Compute coordinate metadata (if coordinates are present) auto mbrs = compute_mbrs(tiles); - // Compute tile metadata. 
- RETURN_CANCEL_OR_ERROR(compute_tiles_metadata(tile_num, tiles)); + RETURN_NOT_OK(filter_tiles(&tiles)); - // Filter all tiles - RETURN_CANCEL_OR_ERROR(filter_tiles(&tiles)); + // include any prepared tiles from previous `submit` which were not flushed + for (const auto& it : buffers_) { + auto& last = global_write_state_->last_tiles_.at(it.first); + if (!last.empty()) { + const uint64_t num_leftover = last.size() - 1; + tiles.at(it.first).splice( + tiles.at(it.first).begin(), + last, + last.begin(), + std::next(last.begin(), num_leftover)); + } + } + tile_num = (tiles.empty() ? 0 : tiles.begin()->second.size()); + + const auto fragments = identify_fragment_tile_boundaries(tiles); + + for (uint64_t f = 0; f < fragments.tile_offsets_.size(); f++) { + const uint64_t input_start_tile = fragments.tile_offsets_[f]; + const uint64_t input_num_tiles = (f + 1 < fragments.tile_offsets_.size() ? + fragments.tile_offsets_[f + 1] : + fragments.num_writeable_tiles_) - + input_start_tile; + + if (input_num_tiles == 0) { + // this should only happen if there is only one tile of input and we have + // to wait for finalize, or if continuing a fragment from a previous write + // and there is no more room + iassert(f == 0); + if (current_fragment_size_ == 0) { + iassert(fragments.tile_offsets_.size() == 1); + } + } else { + if (f > 0 || !global_write_state_->frag_meta_) { + RETURN_CANCEL_OR_ERROR(start_new_fragment()); + } - uint64_t idx = 0; - while (idx < tile_num) { - auto frag_meta = global_write_state_->frag_meta_; + global_write_state_->frag_meta_->set_num_tiles( + global_write_state_->frag_meta_->tile_index_base() + input_num_tiles); - // Compute the number of tiles that will fit in this fragment. 
- auto num = num_tiles_to_write(idx, tile_num, tiles); + set_coords_metadata( + input_start_tile, + input_start_tile + input_num_tiles, + tiles, + mbrs, + global_write_state_->frag_meta_); - // If we're resuming a fragment write and the first tile doesn't fit into - // the previous fragment, we need to start a new fragment and recalculate - // the number of tiles to write. - if (current_fragment_size_ > 0 && num == 0) { - RETURN_CANCEL_OR_ERROR(start_new_fragment()); - num = num_tiles_to_write(idx, tile_num, tiles); + RETURN_CANCEL_OR_ERROR( + populate_fragment(tiles, input_start_tile, input_num_tiles)); } + } - // Set new number of tiles in the fragment metadata - auto new_num_tiles = frag_meta->tile_index_base() + num; - frag_meta->set_num_tiles(new_num_tiles); + current_fragment_size_ = fragments.last_fragment_size_; - if (new_num_tiles == 0) { - throw GlobalOrderWriterException( - "Fragment size is too small to write a single tile"); - } + if (fragments.num_writeable_tiles_ < tile_num) { + // sparse array should be able to write everything + iassert(dense()); - set_coords_metadata(idx, idx + num, tiles, mbrs, frag_meta); + const uint64_t offset_not_written = fragments.num_writeable_tiles_; - // Write tiles for all attributes - RETURN_CANCEL_OR_ERROR(write_tiles(idx, idx + num, frag_meta, &tiles)); - idx += num; + // Dense array does not have bounding rectangles. + // If there were any other tile metadata which we needed to draw from the + // un-filtered tiles, we would have to store that in the global write state + // here. But there is no other such metadata. + iassert(mbrs.empty()); - // If we didn't write all tiles, close this fragment and start another. 
- if (idx != tile_num) { - RETURN_CANCEL_OR_ERROR(start_new_fragment()); + // buffer tiles which could not be written to a fragment yet + for (auto& attr : tiles) { + auto& last = global_write_state_->last_tiles_.at(attr.first); + last.splice( + last.begin(), + attr.second, + std::next(attr.second.begin(), offset_not_written), + attr.second.end()); } - - // Increment the tile index base for the next global order write. - frag_meta->set_tile_index_base(new_num_tiles); } return Status::Ok(); @@ -821,16 +1020,22 @@ Status GlobalOrderWriter::global_write() { Status GlobalOrderWriter::global_write_handle_last_tile() { auto capacity = array_schema_.capacity(); auto& domain = array_schema_.domain(); - auto cell_num_per_tile = - coords_info_.has_coords_ ? capacity : domain.cell_num_per_tile(); + auto cell_num_per_tile = dense() ? domain.cell_num_per_tile() : capacity; auto cell_num_last_tiles = global_write_state_->cells_written_[buffers_.begin()->first] % cell_num_per_tile; if (cell_num_last_tiles == 0) return Status::Ok(); + // if we haven't started a fragment yet, now is the time + // (this can happen if the writes do not fill a full tile) + if (!global_write_state_->frag_meta_) { + RETURN_CANCEL_OR_ERROR(start_new_fragment()); + } + // Reserve space for the last tile in the fragment metadata auto meta = global_write_state_->frag_meta_; + iassert(meta); meta->set_num_tiles(meta->tile_index_base() + 1); // Filter last tiles @@ -906,8 +1111,7 @@ Status GlobalOrderWriter::prepare_full_tiles_fixed( auto capacity = array_schema_.capacity(); auto cell_num = *buffer_size / cell_size; auto& domain{array_schema_.domain()}; - auto cell_num_per_tile = - coords_info_.has_coords_ ? capacity : domain.cell_num_per_tile(); + auto cell_num_per_tile = dense() ?
domain.cell_num_per_tile() : capacity; // Do nothing if there are no cells to write if (cell_num == 0) { @@ -915,7 +1119,7 @@ Status GlobalOrderWriter::prepare_full_tiles_fixed( } // First fill the last tile - auto& last_tile = global_write_state_->last_tiles_.at(name)[0]; + auto& last_tile = *global_write_state_->last_tiles_.at(name).back(); uint64_t cell_idx = 0; uint64_t last_tile_cell_idx = global_write_state_->cells_written_[name] % cell_num_per_tile; @@ -1087,8 +1291,7 @@ Status GlobalOrderWriter::prepare_full_tiles_var( auto capacity = array_schema_.capacity(); auto cell_num = buffer_size / constants::cell_var_offset_size; auto& domain{array_schema_.domain()}; - auto cell_num_per_tile = - coords_info_.has_coords_ ? capacity : domain.cell_num_per_tile(); + auto cell_num_per_tile = dense() ? domain.cell_num_per_tile() : capacity; auto attr_datatype_size = datatype_size(array_schema_.type(name)); // Do nothing if there are no cells to write @@ -1096,7 +1299,7 @@ Status GlobalOrderWriter::prepare_full_tiles_var( return Status::Ok(); // First fill the last tile - auto& last_tile = global_write_state_->last_tiles_.at(name)[0]; + auto& last_tile = *global_write_state_->last_tiles_.at(name).back(); auto& last_var_offset = global_write_state_->last_var_offsets_[name]; uint64_t cell_idx = 0; uint64_t last_tile_cell_idx = @@ -1371,92 +1574,206 @@ Status GlobalOrderWriter::prepare_full_tiles_var( return Status::Ok(); } -uint64_t GlobalOrderWriter::num_tiles_to_write( - uint64_t start, - uint64_t tile_num, - tdb::pmr::unordered_map& tiles) { +/** + * Identifies the division of input cells into target fragments, + * using `max_fragment_size_` as a hard limit on the target fragment size. + * + * `current_fragment_size_` may be nonzero if continuing a fragment from + * a previous `submit()`, so this field is used to initialize the fragment size + * before the first tile is examined. 
+ * + * @param tiles + * + * @return the tile offsets at which each target fragment starts, the number + * of writeable tiles, and the size of the last fragment + */ +GlobalOrderWriter::FragmentTileBoundaries +GlobalOrderWriter::identify_fragment_tile_boundaries( + const tdb::pmr::unordered_map& tiles) + const { // Cache variables to prevent map lookups. const auto buf_names = buffer_names(); - std::vector var_size; - std::vector nullable; - std::vector writer_tile_vectors; - var_size.reserve(buf_names.size()); - nullable.reserve(buf_names.size()); + std::vector writer_tile_vectors; writer_tile_vectors.reserve(buf_names.size()); for (auto& name : buf_names) { - var_size.emplace_back(array_schema_.var_size(name)); - nullable.emplace_back(array_schema_.is_nullable(name)); writer_tile_vectors.emplace_back(&tiles.at(name)); } + // Find number of tiles and gather stats + uint64_t tile_num = 0; + if (!tiles.empty()) { + auto it = tiles.begin(); + tile_num = it->second.size(); + + uint64_t cell_num = 0; + for (size_t t = 0; t < tile_num; ++t) { + cell_num += it->second[t].cell_num(); + } + stats_->add_counter("cell_num", cell_num); + stats_->add_counter("tile_num", tile_num); + } + + uint64_t running_tiles_size = current_fragment_size_; + uint64_t fragment_size = current_fragment_size_; + + uint64_t write_state_start_tile = + global_write_state_->dense_.domain_tile_offset_; + uint64_t current_fragment_num_tiles_already_written = 0; + if (dense() && global_write_state_->frag_meta_) { + current_fragment_num_tiles_already_written = + global_write_state_->frag_meta_->tile_index_base(); + } + + uint64_t fragment_start = 0; + std::vector fragments; + + // NB: this really wants to be `std::optional` but some versions of gcc have a + // false positive uninitialized use warning + int64_t fragment_end = -1; + // Make sure we don't write more than the desired fragment size.
- for (uint64_t t = start; t < tile_num; t++) { + for (uint64_t t = 0; t < tile_num; t++) { uint64_t tile_size = 0; for (uint64_t a = 0; a < buf_names.size(); a++) { - if (var_size[a]) { - tile_size += writer_tile_vectors[a] - ->at(t) - .offset_tile() - .filtered_buffer() - .size(); - tile_size += - writer_tile_vectors[a]->at(t).var_tile().filtered_buffer().size(); - } else { - tile_size += - writer_tile_vectors[a]->at(t).fixed_tile().filtered_buffer().size(); - } + tile_size += writer_tile_vectors[a]->at(t).filtered_size().value(); + } + + if (tile_size > + max_fragment_size_.value_or(std::numeric_limits::max())) { + throw GlobalOrderWriterException( + "Fragment size is too small to write a single tile"); + } - if (nullable[a]) { - tile_size += writer_tile_vectors[a] - ->at(t) - .validity_tile() - .filtered_buffer() - .size(); + bool should_start_new_fragment = false; + + // NB: normally this should only hit once, but if there is a single + // tile larger than the max fragment size it could hit twice and error + if (running_tiles_size + tile_size > + max_fragment_size_.value_or(std::numeric_limits::max())) { + if (fragment_end < 0) { + if (fragment_size == 0) { + throw GlobalOrderWriterException( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + } } + + should_start_new_fragment = true; + } else if (dense() && max_fragment_size_.has_value()) { + // Dense fragments must have a rectangular domain. + // And all fragments must be smaller than `max_fragment_size_`. + // We must identify the highest tile number which satisfies both criteria. 
+ const uint64_t fragment_start_tile = + write_state_start_tile + fragment_start; + const uint64_t maybe_num_tiles = + current_fragment_num_tiles_already_written + t - fragment_start + 1; + should_start_new_fragment = + (is_rectangular_domain( + array_schema_, + subarray_.ndrange(0), + fragment_start_tile, + maybe_num_tiles) == IsRectangularDomain::Never); } - if (current_fragment_size_ + tile_size > fragment_size_) { - return t - start; + if (should_start_new_fragment) { + fragments.push_back(fragment_start); + + iassert(running_tiles_size >= fragment_size); + running_tiles_size -= fragment_size; + + fragment_start = + static_cast(std::max(0, fragment_end)); + fragment_end = -1; + + write_state_start_tile += current_fragment_num_tiles_already_written; + current_fragment_num_tiles_already_written = 0; + } + + bool extends_fragment = true; + if (dense() && max_fragment_size_.has_value()) { + // Dense fragments must have a rectangular domain. + // And all fragments must be smaller than `max_fragment_size_`. + // We must identify the highest tile number which satisfies both criteria. + const uint64_t fragment_start_tile = + write_state_start_tile + fragment_start; + const uint64_t maybe_num_tiles = + current_fragment_num_tiles_already_written + t - fragment_start + 1; + extends_fragment = + (is_rectangular_domain( + array_schema_, + subarray_.ndrange(0), + fragment_start_tile, + maybe_num_tiles) == IsRectangularDomain::Yes); } + if (extends_fragment) { + fragment_size = running_tiles_size + tile_size; + fragment_end = static_cast(t + 1); + } + + running_tiles_size += tile_size; + } - current_fragment_size_ += tile_size; + if (fragment_end >= 0) { + fragments.push_back(fragment_start); } - return tile_num - start; + return GlobalOrderWriter::FragmentTileBoundaries{ + .tile_offsets_ = fragments, + .num_writeable_tiles_ = + (fragment_end < 0 ? 
fragment_start : + static_cast(fragment_end)), + .last_fragment_size_ = fragment_size}; } Status GlobalOrderWriter::start_new_fragment() { - auto frag_meta = global_write_state_->frag_meta_; - auto& uri = frag_meta->fragment_uri(); + // finish off current fragment if there is one + if (global_write_state_->frag_meta_) { + auto frag_meta = global_write_state_->frag_meta_; + auto& uri = frag_meta->fragment_uri(); - // Close all files - RETURN_NOT_OK(close_files(frag_meta)); + // Close all files + RETURN_NOT_OK(close_files(frag_meta)); - // Set the processed conditions - frag_meta->set_processed_conditions(processed_conditions_); + // Update dense fragment domain + if (dense()) { + const uint64_t num_tiles_in_fragment = + frag_meta->loaded_metadata()->tile_offsets()[0].size(); + std::optional fragment_domain = domain_tile_offset( + array_schema_, + subarray_.ndrange(0), + global_write_state_->dense_.domain_tile_offset_, + num_tiles_in_fragment); + iassert(fragment_domain.has_value()); + frag_meta->set_domain(std::move(fragment_domain.value())); - // Compute fragment min/max/sum/null count - frag_meta->compute_fragment_min_max_sum_null_count(); + global_write_state_->dense_.domain_tile_offset_ += num_tiles_in_fragment; + } + + // Set the processed conditions + frag_meta->set_processed_conditions(processed_conditions_); - // Flush fragment metadata to storage - frag_meta->store(array_->get_encryption_key()); + // Compute fragment min/max/sum/null count + frag_meta->compute_fragment_min_max_sum_null_count(); - frag_uris_to_commit_.emplace_back(uri); + // Flush fragment metadata to storage + frag_meta->store(array_->get_encryption_key()); - // Make a new fragment URI. 
- const auto write_version = array_->array_schema_latest().write_version(); - auto frag_dir_uri = - array_->array_directory().get_fragments_dir(write_version); - auto new_fragment_str = storage_format::generate_timestamped_name( - fragment_timestamp_range_.first, - fragment_timestamp_range_.second, - write_version); - fragment_uri_ = frag_dir_uri.join_path(new_fragment_str); + frag_uris_to_commit_.emplace_back(uri); + + // Make a new fragment URI. + const auto write_version = array_->array_schema_latest().write_version(); + auto frag_dir_uri = + array_->array_directory().get_fragments_dir(write_version); + auto new_fragment_str = storage_format::generate_timestamped_name( + fragment_timestamp_range_.first, + fragment_timestamp_range_.second, + write_version); + fragment_uri_ = frag_dir_uri.join_path(new_fragment_str); + } // Create a new fragment. current_fragment_size_ = 0; - RETURN_NOT_OK(create_fragment( - !coords_info_.has_coords_, global_write_state_->frag_meta_)); + RETURN_NOT_OK(create_fragment(dense(), global_write_state_->frag_meta_)); return Status::Ok(); } diff --git a/tiledb/sm/query/writers/global_order_writer.h b/tiledb/sm/query/writers/global_order_writer.h index c15b81f67c8..b3eff76d081 100644 --- a/tiledb/sm/query/writers/global_order_writer.h +++ b/tiledb/sm/query/writers/global_order_writer.h @@ -75,6 +75,16 @@ class GlobalOrderWriter : public WriterBase { * attributes/dimensions, the first tile is the offsets tile, whereas the * second tile is the values tile. In both cases, the third tile stores a * validity tile for nullable attributes. + * + * For sparse arrays, each `WriterTileTupleVector` contains up to one tile, + * which is the data from the previous `submit` which did not fill a tile. + * + * For dense arrays, each `WriterTileTupleVector` contains any tiles which + * were not guaranteed to fit into `max_fragment_size_` while also forming + * a bounding rectangle. 
Written fragments always have a rectangular domain, + * and it is necessary to buffer tiles this way to avoid flushing data + * which might later require a fragment to exceed `max_fragment_size_` + * in order to represent a rectangular domain. */ tdb::pmr::unordered_map last_tiles_; @@ -108,6 +118,36 @@ class GlobalOrderWriter : public WriterBase { */ std::unordered_map multipart_upload_state_; + + /** + * State for writing dense fragments. + * + * Dense fragments use the bounding rectangle as a precise determination + * of where the contents of the fragment are in the domain, and as such + * it must be written correctly. This is usually not a problem, however + * global order writes can: + * 1) split up a single write into multiple fragments in order to satisfy + * the `max_fragment_size_` parameter + * 2) write into a single domain over the course of multiple `submit` + * calls which each write an arbitrary subset of the domain, + * re-using the buffers + * + * Both of these make it non-trivial to determine what the domain written + * into a fragment actually was, when the fragment fills up + * `max_fragment_size`. + * + * The fields of this struct, as well as `last_tiles_` of the outer struct, + * are used to track the amount of data which the writer has already + * processed so as to keep the correct position in the target subarray. + */ + struct DenseWriteState { + /** + * Tile offset in the subarray domain which the current fragment began + * writing to. 
+ */ + uint64_t domain_tile_offset_; + }; + DenseWriteState dense_; }; /* ********************************* */ @@ -119,7 +159,7 @@ class GlobalOrderWriter : public WriterBase { stats::Stats* stats, shared_ptr logger, StrategyParams& params, - uint64_t fragment_size, + std::optional fragment_size, std::vector& written_fragment_info, bool disable_checks_consolidation, std::vector& processed_conditions, @@ -158,6 +198,9 @@ class GlobalOrderWriter : public WriterBase { /** Returns a bare pointer to the global state. */ GlobalWriteState* get_global_state(); + /** Returns a bare pointer to the global state. */ + const GlobalWriteState* get_global_state() const; + /** * Used in serialization to share the multipart upload state * among cloud executors @@ -208,7 +251,7 @@ class GlobalOrderWriter : public WriterBase { * The desired fragment size, in bytes. The writer will create a new fragment * once this size has been reached. */ - uint64_t fragment_size_; + std::optional max_fragment_size_; /** * Size currently written to the fragment. @@ -371,19 +414,59 @@ class GlobalOrderWriter : public WriterBase { WriterTileTupleVector* tiles) const; /** - * Return the number of tiles to write depending on the desired fragment - * size. The tiles passed in as an argument should have already been - * filtered. + * Contains the return values of + * `GlobalOrderWriter::identify_fragment_tile_boundaries`. + */ + struct FragmentTileBoundaries { + /** + * The offsets where each complete fragment starts. + */ + std::vector tile_offsets_; + + /** + * The number of writeable tiles. + * For sparse arrays this is the number of tiles of input. + * For dense arrays this may be less if there is a trail of tiles which + * cannot be guaranteed to fit within `max_fragment_size` while also forming + * a rectangular domain. + */ + uint64_t num_writeable_tiles_; + + /** + * The size in bytes of the filtered tiles which are written to the last + * fragment. 
The last fragment may be resumed by a subsequent `submit`. + */ + uint64_t last_fragment_size_; + }; + + /** + * Identify the manner in which the filtered input tiles map onto target + * fragments. If `max_fragment_size_` is much larger than the input, this may + * identify just one fragment. + * + * The result records the tile offset at which each complete fragment starts, + * the number of tiles which can be written, and the size in bytes of the + * tiles written to the last fragment. * - * @param start Current tile index. - * @param tile_num Number of tiles in the tiles vectors. * @param tiles Map of vector of tiles, per attributes. - * @return Number of tiles to write. + * + * @return see `FragmentTileBoundaries` documentation */ - uint64_t num_tiles_to_write( - uint64_t start, - uint64_t tile_num, - tdb::pmr::unordered_map& tiles); + FragmentTileBoundaries identify_fragment_tile_boundaries( + const tdb::pmr::unordered_map& tiles) + const; + + /** + * Writes cells from the indicated slice of `tiles` into the current fragment. + * + * @param tiles the source of cells organized into filtered tiles + * @param tile_offset the tile from which to begin writing + * @param num_tiles the number of tiles to write + */ + Status populate_fragment( + tdb::pmr::unordered_map& tiles, + uint64_t tile_offset, + uint64_t num_tiles); /** * Close the current fragment and start a new one. The closed fragment will @@ -391,6 +474,13 @@ class GlobalOrderWriter : public WriterBase { * be written at once.
*/ Status start_new_fragment(); + + /** + * @return true if this write is to a dense fragment + */ + bool dense() const { + return !coords_info_.has_coords_; + } }; } // namespace sm diff --git a/tiledb/sm/query/writers/unordered_writer.cc b/tiledb/sm/query/writers/unordered_writer.cc index ed568d2fa87..4b8ea2d6b45 100644 --- a/tiledb/sm/query/writers/unordered_writer.cc +++ b/tiledb/sm/query/writers/unordered_writer.cc @@ -699,7 +699,7 @@ Status UnorderedWriter::unordered_write() { } // Compute tile metadata. - RETURN_CANCEL_OR_ERROR(compute_tiles_metadata(tile_num, tiles)); + RETURN_CANCEL_OR_ERROR(compute_tiles_metadata(tiles)); // Filter all tiles RETURN_CANCEL_OR_ERROR(filter_tiles(&tiles)); diff --git a/tiledb/sm/query/writers/writer_base.cc b/tiledb/sm/query/writers/writer_base.cc index 3e13b990dc9..e0fcd5683c0 100644 --- a/tiledb/sm/query/writers/writer_base.cc +++ b/tiledb/sm/query/writers/writer_base.cc @@ -614,6 +614,8 @@ Status WriterBase::close_files(shared_ptr meta) const { } std::vector WriterBase::compute_mbrs( + uint64_t start_tile_idx, + uint64_t end_tile_idx, const tdb::pmr::unordered_map& tiles) const { auto timer_se = stats_->start_timer("compute_coord_meta"); @@ -628,16 +630,13 @@ std::vector WriterBase::compute_mbrs( return std::vector(); } - // Compute number of tiles. 
Assumes all attributes and - // and dimensions have the same number of tiles - auto tile_num = tiles.begin()->second.size(); auto dim_num = array_schema_.dim_num(); // Compute MBRs - std::vector mbrs(tile_num); - auto status = - parallel_for(&resources_.compute_tp(), 0, tile_num, [&](uint64_t i) { - mbrs[i].resize(dim_num); + std::vector mbrs(end_tile_idx - start_tile_idx); + auto status = parallel_for( + &resources_.compute_tp(), start_tile_idx, end_tile_idx, [&](uint64_t i) { + mbrs[i - start_tile_idx].resize(dim_num); std::vector data(dim_num); for (unsigned d = 0; d < dim_num; ++d) { auto dim{array_schema_.dimension_ptr(d)}; @@ -689,12 +688,13 @@ void WriterBase::set_coords_metadata( } Status WriterBase::compute_tiles_metadata( - uint64_t tile_num, + uint64_t start_tile_idx, + uint64_t end_tile_idx, tdb::pmr::unordered_map& tiles) const { auto* compute_tp = &resources_.compute_tp(); // Parallelize over attributes? - if (tiles.size() > tile_num) { + if (tiles.size() > (end_tile_idx - start_tile_idx)) { auto st = parallel_for(compute_tp, 0, tiles.size(), [&](uint64_t i) { auto tiles_it = tiles.begin(); std::advance(tiles_it, i); @@ -724,14 +724,15 @@ Status WriterBase::compute_tiles_metadata( const auto var_size = array_schema_.var_size(attr); const auto cell_size = array_schema_.cell_size(attr); const auto cell_val_num = array_schema_.cell_val_num(attr); - auto st = parallel_for(compute_tp, 0, tile_num, [&](uint64_t t) { - TileMetadataGenerator md_generator( - type, is_dim, var_size, cell_size, cell_val_num); - md_generator.process_full_tile(attr_tiles[t]); - md_generator.set_tile_metadata(attr_tiles[t]); + auto st = parallel_for( + compute_tp, start_tile_idx, end_tile_idx, [&](uint64_t t) { + TileMetadataGenerator md_generator( + type, is_dim, var_size, cell_size, cell_val_num); + md_generator.process_full_tile(attr_tiles[t]); + md_generator.set_tile_metadata(attr_tiles[t]); - return Status::Ok(); - }); + return Status::Ok(); + }); RETURN_NOT_OK(st); } } @@ 
-757,7 +758,9 @@ std::string WriterBase::coords_to_str(uint64_t i) const { } Status WriterBase::create_fragment( - bool dense, shared_ptr& frag_meta) { + bool dense, + shared_ptr& frag_meta, + const NDRange* domain) { // Get write version, timestamp array was opened, and a reference to the // array directory. auto write_version = array_->array_schema_latest().write_version(); @@ -787,18 +790,21 @@ Status WriterBase::create_fragment( has_timestamps, has_delete_metadata); - frag_meta->init(subarray_.ndrange(0)); + frag_meta->init(domain ? *domain : subarray_.ndrange(0)); return Status::Ok(); } Status WriterBase::filter_tiles( + uint64_t start_tile_idx, + uint64_t end_tile_idx, tdb::pmr::unordered_map* tiles) { auto timer_se = stats_->start_timer("filter_tiles"); auto status = parallel_for(&resources_.compute_tp(), 0, tiles->size(), [&](uint64_t i) { auto tiles_it = tiles->begin(); std::advance(tiles_it, i); - throw_if_not_ok(filter_tiles(tiles_it->first, &tiles_it->second)); + throw_if_not_ok(filter_tiles( + start_tile_idx, end_tile_idx, tiles_it->first, &tiles_it->second)); throw_if_cancelled(); return Status::Ok(); }); @@ -808,7 +814,10 @@ Status WriterBase::filter_tiles( } Status WriterBase::filter_tiles( - const std::string& name, WriterTileTupleVector* tiles) { + uint64_t start_tile_idx, + uint64_t end_tile_idx, + const std::string& name, + WriterTileTupleVector* tiles) { const bool var_size = array_schema_.var_size(name); const bool nullable = array_schema_.is_nullable(name); @@ -818,7 +827,8 @@ Status WriterBase::filter_tiles( // Process all tiles, minus offsets, they get processed separately. 
std::vector> args; args.reserve(tile_num * (1 + nullable)); - for (auto& tile : *tiles) { + for (uint64_t t = start_tile_idx; t < end_tile_idx; t++) { + auto& tile = (*tiles)[t]; if (var_size) { args.emplace_back(&tile.var_tile(), &tile.offset_tile(), false, false); } else { diff --git a/tiledb/sm/query/writers/writer_base.h b/tiledb/sm/query/writers/writer_base.h index 4f085c1b7c8..b7304903002 100644 --- a/tiledb/sm/query/writers/writer_base.h +++ b/tiledb/sm/query/writers/writer_base.h @@ -242,14 +242,27 @@ class WriterBase : public StrategyBase, public IQueryStrategy { /** * Computes the MBRs. * + * @param start_tile_idx The index of the first tile to compute MBR for + * @param end_tile_idx One past the last tile index to compute MBR for * @param tiles The tiles to calculate the MBRs from. It is a map of vectors, * one vector of tiles per dimension/coordinates. * @return MBRs. */ std::vector compute_mbrs( + uint64_t start_tile_idx, + uint64_t end_tile_idx, const tdb::pmr::unordered_map& tiles) const; + /** + * Computes the MBRs for all of the requested tiles. See above. + */ + std::vector compute_mbrs( + const tdb::pmr::unordered_map& tiles) + const { + return compute_mbrs(0, tiles.begin()->second.size(), tiles); + } + /** * Set the coordinates metadata (e.g., MBRs). * @@ -270,15 +283,26 @@ class WriterBase : public StrategyBase, public IQueryStrategy { /** * Computes the tiles metadata (min/max/sum/null count). * - * @param tile_num The number of tiles. + * @param start_tile_idx The index of the first tile to compute metadata for + * @param end_tile_idx One past the last tile index to compute metadata for * @param tiles The tiles to calculate the tile metadata from. It is * a map of vectors, one vector of tiles per dimension. * @return Status */ Status compute_tiles_metadata( - uint64_t tile_num, + uint64_t start_tile_idx, + uint64_t end_tile_idx, tdb::pmr::unordered_map& tiles) const; + /** + * Computes the tiles metadata for each tile in the provided list.
See above. + */ + Status compute_tiles_metadata( + tdb::pmr::unordered_map& tiles) + const { + return compute_tiles_metadata(0, tiles.begin()->second.size(), tiles); + } + /** * Returns the i-th coordinates in the coordinate buffers in string * format. @@ -293,27 +317,51 @@ class WriterBase : public StrategyBase, public IQueryStrategy { * * @param dense Whether the fragment is dense or not. * @param frag_meta The fragment metadata to be generated. + * @param domain Optional domain for the fragment, uses subarray 0th range if + * not provided * @return Status */ - Status create_fragment(bool dense, shared_ptr& frag_meta); + Status create_fragment( + bool dense, + shared_ptr& frag_meta, + const NDRange* domain = nullptr); /** * Runs the input coordinate and attribute tiles through their * filter pipelines. The tile buffers are modified to contain the output * of the pipeline. + * + * @param start_tile_idx The index of the first tile to filter + * @param end_tile_idx One past the last tile index to filter */ Status filter_tiles( + uint64_t start_tile_idx, + uint64_t end_tile_idx, tdb::pmr::unordered_map* tiles); + /** + * See above, filtering all of the provided tiles. + */ + Status filter_tiles( + tdb::pmr::unordered_map* tiles) { + return filter_tiles(0, tiles->begin()->second.size(), tiles); + } + /** * Runs the input tiles for the input attribute through the filter pipeline. * The tile buffers are modified to contain the output of the pipeline. * + * @param start_tile_idx The index of the first tile to filter + * @param end_tile_idx One past the last tile index to filter * @param name The attribute/dimension the tiles belong to. * @param tile The tiles to be filtered.
* @return Status */ - Status filter_tiles(const std::string& name, WriterTileTupleVector* tiles); + Status filter_tiles( + uint64_t start_tile_idx, + uint64_t end_tile_idx, + const std::string& name, + WriterTileTupleVector* tiles); /** * Runs the input tile for the input attribute/dimension through the filter diff --git a/tiledb/sm/tile/arithmetic.h b/tiledb/sm/tile/arithmetic.h new file mode 100644 index 00000000000..58882132d2c --- /dev/null +++ b/tiledb/sm/tile/arithmetic.h @@ -0,0 +1,228 @@ +/** + * @file tiledb/sm/tile/arithmetic.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file provides template definitions for doing tile arithmetic, + * e.g. computing new domains based on offsets and such. 
+ * + * Definitions: + * + * **Hyperrectangle**: + * The generalization of a rectangle to higher dimensions. + * This is a standard term from mathematical literature. + * + * **Hyperrow**: + * The generalization of a row to higher dimensions. + * This does not appear to be a standard term from mathematical literature. + * A row in a 2D domain is a rectangle of height 1, i.e. spanning a single + * coordinate of the outermost "row" dimension. So, in a higher-dimensional + * plane, a hyperrow is a hyperrectangle which spans a single coordinate of the + * outermost dimension. For example, in a 3D domain a hyperrow is a plane. + */ +#ifndef TILEDB_TILE_ARITHMETIC_H +#define TILEDB_TILE_ARITHMETIC_H + +#include "tiledb/common/arithmetic.h" +#include "tiledb/sm/array_schema/dimension.h" +#include "tiledb/sm/enums/layout.h" +#include "tiledb/sm/misc/types.h" +#include "tiledb/type/range/range.h" + +namespace tiledb::sm { + +/** + * Ternary value for the result of `is_rectangular_domain`. + * Describes whether a `[start_tile, start_tile + num_tiles)` range + * over a given domain forms a rectangle. + */ +enum class IsRectangularDomain { + /** The range is not a rectangle, but extending it could create one. */ + No, + /** The range is not a rectangle, and extending it can never create one. */ + Never, + /** The range is a rectangle. */ + Yes +}; + +/** + * @return `Yes` if the range `[start_tile, start_tile + num_tiles)` is a + * hyper-rectangle in `domain` tiled by `tile_extents`, else `No` or `Never` + */ +template +static IsRectangularDomain is_rectangular_domain( + Layout tile_order, + std::span tile_extents, + const sm::NDRange& domain, + uint64_t start_tile, + uint64_t num_tiles) { + for (uint64_t d_outer = 0; d_outer < tile_extents.size(); d_outer++) { + uint64_t hyperrow_num_tiles = 1; + for (uint64_t d_inner = d_outer + 1; d_inner < tile_extents.size(); + d_inner++) { + const uint64_t d = + (tile_order == Layout::ROW_MAJOR ?
d_inner : + tile_extents.size() - d_inner - 1); + const uint64_t d_inner_num_tiles = + sm::Dimension::tile_idx( + domain[d].end_as(), domain[d].start_as(), tile_extents[d]) + + 1; + + const auto maybe = checked_arithmetic::mul( + hyperrow_num_tiles, d_inner_num_tiles); + if (maybe.has_value()) { + hyperrow_num_tiles = maybe.value(); + } else { + throw std::overflow_error( + "Cannot compute subrectangle of domain due to arithmetic overflow: " + "domain tile extents may be too large"); + } + } + + const uint64_t hyperrow_offset = start_tile % hyperrow_num_tiles; + if (hyperrow_offset + num_tiles > hyperrow_num_tiles) { + if (hyperrow_offset != 0) { + return IsRectangularDomain::Never; + } else if (num_tiles % hyperrow_num_tiles != 0) { + return IsRectangularDomain::No; + } + } + } + return IsRectangularDomain::Yes; +} + +/** + * Compute the number of tiles per hyperrow for the given `domain` with tiles + * given by `tile_extents`. + * + * For D dimensions, the returned vector contains `D+1` elements. + * Position 0 is the number of tiles in `domain`. + * For dimension `d`, position `d + 1` is the number of tiles in a hyperrow of + * dimension `d` (and is thus always 1 for the final dimension). + */ +template +std::vector> compute_hyperrow_sizes( + Layout tile_order, + std::span tile_extents, + const sm::NDRange& domain) { + std::vector> hyperrow_sizes( + tile_extents.size() + 1, 1); + for (uint64_t di = 0; di < tile_extents.size(); di++) { + const uint64_t d = + (tile_order == Layout::ROW_MAJOR ? 
di : tile_extents.size() - di - 1); + const uint64_t d_num_tiles = + sm::Dimension::tile_idx( + domain[d].end_as(), domain[d].start_as(), tile_extents[d]) + + 1; + hyperrow_sizes[di] = d_num_tiles; + } + for (uint64_t d = tile_extents.size(); d > 0; d--) { + if (hyperrow_sizes[d - 1].has_value() && hyperrow_sizes[d].has_value()) { + hyperrow_sizes[d - 1] = checked_arithmetic::mul( + hyperrow_sizes[d - 1].value(), hyperrow_sizes[d].value()); + } else { + hyperrow_sizes[d - 1] = std::nullopt; + } + } + + return hyperrow_sizes; +} + +/** + * @return a new range which is the rectangle contained within `domain` defined + * by `[start_tile, start_tile + num_tiles)` for the tile sizes given by + * `tile_extents`. If this does not represent a valid rectangle then + * `std::nullopt` is returned instead. + */ +template +static std::optional domain_tile_offset( + Layout tile_order, + std::span tile_extents, + const sm::NDRange& domain, + uint64_t start_tile, + uint64_t num_tiles) { + sm::NDRange r; + r.resize(tile_extents.size()); + + const std::vector> dimension_sizes = + compute_hyperrow_sizes(tile_order, tile_extents, domain); + + for (uint64_t di = 0; di < tile_extents.size(); di++) { + const uint64_t d = + (tile_order == Layout::ROW_MAJOR ?
di : tile_extents.size() - di - 1); + + if (!dimension_sizes[di + 1].has_value()) { + throw std::overflow_error( + "Cannot compute subrectangle of domain due to arithmetic overflow: " + "domain tile extents may be too large"); + } + const uint64_t hyperrow_num_tiles = dimension_sizes[di + 1].value(); + + T this_dimension_start_tile, this_dimension_end_tile; + if (dimension_sizes[di].has_value()) { + const uint64_t outer_num_tiles = dimension_sizes[di].value(); + this_dimension_start_tile = (start_tile / hyperrow_num_tiles) % + (outer_num_tiles / hyperrow_num_tiles); + this_dimension_end_tile = + ((start_tile + num_tiles - 1) / hyperrow_num_tiles) % + (outer_num_tiles / hyperrow_num_tiles); + } else { + this_dimension_start_tile = start_tile / hyperrow_num_tiles; + this_dimension_end_tile = + (start_tile + num_tiles - 1) / hyperrow_num_tiles; + } + + if (start_tile % hyperrow_num_tiles == 0) { + // aligned to the start of the hyperrow + if (num_tiles > hyperrow_num_tiles && + num_tiles % hyperrow_num_tiles != 0) { + return std::nullopt; + } + } else { + // begins in the middle of the hyperrow + const uint64_t offset = start_tile % hyperrow_num_tiles; + if (offset + num_tiles > hyperrow_num_tiles) { + return std::nullopt; + } + } + + const T start = + domain[d].start_as() + (this_dimension_start_tile * tile_extents[d]); + const T end = domain[d].start_as() + + (this_dimension_end_tile * tile_extents[d]) + + tile_extents[d] - 1; + r[d] = Range( + std::max(domain[d].start_as(), start), + std::min(domain[d].end_as(), end)); + } + + return r; +} + +} // namespace tiledb::sm + +#endif diff --git a/tiledb/sm/tile/test/CMakeLists.txt b/tiledb/sm/tile/test/CMakeLists.txt index 6feeb4eafb1..bb06dbdcb41 100644 --- a/tiledb/sm/tile/test/CMakeLists.txt +++ b/tiledb/sm/tile/test/CMakeLists.txt @@ -29,7 +29,10 @@ include(unit_test) commence(unit_test tile) this_target_sources( main.cc + unit_arithmetic.cc unit_tile.cc + 
${CMAKE_SOURCE_DIR}/test/support/rapidcheck/show/array_schema_templates.cc ) this_target_object_libraries(tile mem_helpers) + this_target_link_libraries(rapidcheck) conclude(unit_test) diff --git a/tiledb/sm/tile/test/arithmetic.h b/tiledb/sm/tile/test/arithmetic.h new file mode 100644 index 00000000000..659fb5329ec --- /dev/null +++ b/tiledb/sm/tile/test/arithmetic.h @@ -0,0 +1,95 @@ +/** + * @file tiledb/sm/tile/test/arithmetic.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file provides template definitions for functions which are + * used to test tile arithmetic.
+ */ +#ifndef TILEDB_TILE_TEST_ARITHMETIC_H +#define TILEDB_TILE_TEST_ARITHMETIC_H + +#include "tiledb/sm/tile/arithmetic.h" + +namespace tiledb::test { + +/** + * @return the number of tiles in `subrectangle` based on the tile sizes in + * `tile_extents` + */ +template +uint64_t compute_num_tiles( + std::span tile_extents, const sm::NDRange& subrectangle) { + uint64_t num_tiles_result = 1; + for (uint64_t d = 0; d < tile_extents.size(); d++) { + const uint64_t num_tiles_this_dimension = sm::Dimension::tile_idx( + subrectangle[d].end_as(), + subrectangle[d].start_as(), + tile_extents[d]) + + 1; + num_tiles_result *= num_tiles_this_dimension; + } + + return num_tiles_result; +} + +/** + * @return the tile offset of `subrectangle` within `domain` based on the tile + * sizes in `tile_extents` + */ +template +std::optional compute_start_tile( + sm::Layout tile_order, + std::span tile_extents, + const sm::NDRange& domain, + const sm::NDRange& subrectangle) { + const std::vector> hyperrow_sizes = + sm::compute_hyperrow_sizes(tile_order, tile_extents, domain); + + uint64_t start_tile_result = 0; + for (uint64_t di = 0; di < tile_extents.size(); di++) { + const uint64_t d = + (tile_order == sm::Layout::ROW_MAJOR ? 
di : + tile_extents.size() - di - 1); + const uint64_t start_tile_this_dimension = sm::Dimension::tile_idx( + subrectangle[d].start_as(), + domain[d].start_as(), + tile_extents[d]); + if (hyperrow_sizes[di + 1].has_value()) { + start_tile_result += + start_tile_this_dimension * hyperrow_sizes[di + 1].value(); + } else { + return std::nullopt; + } + } + + return start_tile_result; +} + +} // namespace tiledb::test + +#endif diff --git a/tiledb/sm/tile/test/unit_arithmetic.cc b/tiledb/sm/tile/test/unit_arithmetic.cc new file mode 100644 index 00000000000..333762dd399 --- /dev/null +++ b/tiledb/sm/tile/test/unit_arithmetic.cc @@ -0,0 +1,756 @@ +#include +#include +#include "test/support/rapidcheck/array_schema_templates.h" +#include "test/support/src/array_schema_templates.h" +#include "tiledb/sm/array_schema/dimension.h" +#include "tiledb/sm/misc/types.h" +#include "tiledb/sm/tile/arithmetic.h" +#include "tiledb/sm/tile/test/arithmetic.h" +#include "tiledb/type/range/range.h" + +#include + +using namespace tiledb; +using namespace sm; +using namespace tiledb::test; + +template +static IsRectangularDomain is_rectangular_domain( + std::span tile_extents, + T lower_bound, + T upper_bound, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + sm::NDRange r; + r.push_back(Range(lower_bound, upper_bound)); + return is_rectangular_domain( + tile_order, tile_extents, r, start_tile, num_tiles); +} + +template +static IsRectangularDomain is_rectangular_domain( + std::span tile_extents, + T d1_lower_bound, + T d1_upper_bound, + T d2_lower_bound, + T d2_upper_bound, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + sm::NDRange r; + r.push_back(Range(d1_lower_bound, d1_upper_bound)); + r.push_back(Range(d2_lower_bound, d2_upper_bound)); + return is_rectangular_domain( + tile_order, tile_extents, r, start_tile, num_tiles); +} + +template +static IsRectangularDomain is_rectangular_domain( + const 
templates::Dimension
& d1, + const templates::Dimension
& d2, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + using Coord = templates::Dimension
::value_type; + const std::vector extents = {d1.extent, d2.extent}; + return is_rectangular_domain( + extents, + d1.domain.lower_bound, + d1.domain.upper_bound, + d2.domain.lower_bound, + d2.domain.upper_bound, + start_tile, + num_tiles, + tile_order); +} + +template +static IsRectangularDomain is_rectangular_domain( + const templates::Dimension
& d1, + const templates::Dimension
& d2, + const templates::Dimension
& d3, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + using Coord = templates::Dimension
::value_type; + const std::vector extents = {d1.extent, d2.extent, d3.extent}; + sm::NDRange r; + r.push_back(Range(d1.domain.lower_bound, d1.domain.upper_bound)); + r.push_back(Range(d2.domain.lower_bound, d2.domain.upper_bound)); + r.push_back(Range(d3.domain.lower_bound, d3.domain.upper_bound)); + return is_rectangular_domain( + tile_order, extents, r, start_tile, num_tiles); +} + +// in one dimension all domains are rectangles +TEST_CASE("is_rectangular_domain 1d", "[arithmetic]") { + rc::prop( + "is_rectangular_domain 1d", + [](templates::Dimension dimension) { + const uint64_t start_tile = + *rc::gen::inRange(0, dimension.num_tiles()); + const uint64_t num_tiles = + *rc::gen::inRange(1, dimension.num_tiles() - start_tile); + + const std::vector extents = {dimension.extent}; + RC_ASSERT( + is_rectangular_domain( + extents, + dimension.domain.lower_bound, + dimension.domain.upper_bound, + start_tile, + num_tiles) == IsRectangularDomain::Yes); + }); +} + +TEST_CASE("is_rectangular_domain 2d", "[arithmetic]") { + /* + * Domain is a 16x16 square + */ + SECTION("Square") { + const uint64_t d1_lower = GENERATE(0, 3); + const uint64_t d1_upper = d1_lower + 16 - 1; + const uint64_t d2_lower = GENERATE(0, 3); + const uint64_t d2_upper = d2_lower + 16 - 1; + + SECTION("Row tiles") { + const std::vector extents = {1, 16}; + for (uint64_t start_tile = 0; start_tile < 15; start_tile++) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= 16; + num_tiles++) { + CAPTURE(start_tile, num_tiles); + CHECK( + is_rectangular_domain( + extents, + d1_lower, + d1_upper, + d2_lower, + d2_upper, + start_tile, + num_tiles) == IsRectangularDomain::Yes); + } + } + } + + SECTION("Square tiles") { + // 7x7 tiles will subdivide the 16x16 square into 3x3 tiles + const std::vector extents = {7, 7}; + + auto tt = [&](uint64_t start_tile, + uint64_t num_tiles) -> IsRectangularDomain { + return is_rectangular_domain( + extents, + d1_lower, + d1_upper, + d2_lower, + d2_upper, + start_tile, + 
num_tiles); + }; + + // tiles aligned with the start: rectangle formed if less than one row, or + // integral number of rows + for (uint64_t start_tile : {0, 3, 6}) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= 9; num_tiles++) { + CAPTURE(start_tile, num_tiles); + if (num_tiles < 3 || num_tiles % 3 == 0) { + CHECK(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else { + CHECK(tt(start_tile, num_tiles) == IsRectangularDomain::No); + } + } + } + + // otherwise a rectangle is only formed within the same row + for (uint64_t start_tile : {1, 2, 4, 5, 7, 8}) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= 9; num_tiles++) { + CAPTURE(start_tile, num_tiles); + if ((start_tile % 3) + num_tiles <= 3) { + CHECK(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else { + CHECK(tt(start_tile, num_tiles) == IsRectangularDomain::Never); + } + } + } + } + } + + using Dim64 = templates::Dimension; + + auto instance_is_rectangular_domain_2d = + [](Dim64 d1, Dim64 d2) { + const std::vector extents = {d1.extent, d2.extent}; + auto tt = [&](uint64_t start_tile, + uint64_t num_tiles) -> IsRectangularDomain { + return is_rectangular_domain(d1, d2, start_tile, num_tiles); + }; + + const uint64_t total_tiles = d1.num_tiles() * d2.num_tiles(); + + for (uint64_t t = 0; t < d1.num_tiles(); t += d2.num_tiles()) { + // row-aligned tiles + for (uint64_t num_tiles = 1; t + num_tiles <= total_tiles; + num_tiles++) { + if (num_tiles <= d2.num_tiles() || + num_tiles % d2.num_tiles() == 0) { + ASSERTER(tt(t, num_tiles) == IsRectangularDomain::Yes); + } else { + ASSERTER(tt(t, num_tiles) == IsRectangularDomain::No); + } + } + // other tiles + for (uint64_t o = 1; t + o < d2.num_tiles(); o++) { + for (uint64_t num_tiles = 1; t + o + num_tiles <= total_tiles; + num_tiles++) { + if (((t + o) % d2.num_tiles()) + num_tiles <= d2.num_tiles()) { + ASSERTER(tt(t + o, num_tiles) == IsRectangularDomain::Yes); + } else { + ASSERTER(tt(t + o, num_tiles) == 
IsRectangularDomain::Never); + } + } + } + } + }; + + SECTION("Shrinking") { + instance_is_rectangular_domain_2d(Dim64(0, 2, 1), Dim64(0, 0, 1)); + instance_is_rectangular_domain_2d(Dim64(0, 2, 1), Dim64(0, 1, 1)); + } + + rc::prop("is_rectangular_domain 2d", [&]() { + Dim64 d1 = *rc::make_dimension(std::nullopt, {64}); + Dim64 d2 = *rc::make_dimension(std::nullopt, {64}); + instance_is_rectangular_domain_2d.operator()(d1, d2); + }); +} + +TEST_CASE("is_rectangular_domain 3d", "[arithmetic]") { + using Dim64 = templates::Dimension; + + /** + * 3D plane tiles (where the outermost dimension has extent 1) + * should produce the same results as rectangular tiles in the plane + */ + rc::prop("plane tiles", [&]() { + Dim64 d1 = *rc::make_dimension(std::nullopt, {1}); + Dim64 d2 = *rc::make_dimension(std::nullopt, {32}); + Dim64 d3 = *rc::make_dimension(std::nullopt, {32}); + + const uint64_t total_tiles = + d1.num_tiles() * d2.num_tiles() * d3.num_tiles(); + for (uint64_t start_tile = 0; start_tile < total_tiles; start_tile++) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= total_tiles; + num_tiles++) { + const IsRectangularDomain rectangle = + is_rectangular_domain(d2, d3, start_tile, num_tiles); + const IsRectangularDomain plane = + is_rectangular_domain(d1, d2, d3, start_tile, num_tiles); + + RC_ASSERT(rectangle == plane); + } + } + }); + + /** + * Runs over the possible `(start_tiles, num_tiles)` pairs for dimensions + * `{d1, d2, d3}` and asserts that `is_rectangular_domain` returns true if and + * only if the pair represents an expected rectangle. 
+ */ + auto instance_is_rectangular_domain_3d = []( + Dim64 d1, Dim64 d2, Dim64 d3) { + auto tt = [&](uint64_t start_tile, + uint64_t num_tiles) -> IsRectangularDomain { + return is_rectangular_domain(d1, d2, d3, start_tile, num_tiles); + }; + + const uint64_t total_tiles = + d1.num_tiles() * d2.num_tiles() * d3.num_tiles(); + const uint64_t plane_tiles = d2.num_tiles() * d3.num_tiles(); + + for (uint64_t start_tile = 0; start_tile < total_tiles; start_tile++) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= total_tiles; + num_tiles++) { + if (start_tile % plane_tiles == 0) { + // aligned to a plane, several options to be a rectangle + if (num_tiles <= d3.num_tiles()) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else if ( + num_tiles <= plane_tiles && num_tiles % d3.num_tiles() == 0) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else if (num_tiles % plane_tiles == 0) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::No); + } + } else if (start_tile % d3.num_tiles() == 0) { + // aligned to a row within a plane, but not aligned to the plane + // this is a rectangle if it is an integral number of rows, or + // fits within a row + if (num_tiles <= d3.num_tiles()) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else if ( + num_tiles % d3.num_tiles() == 0 && + (start_tile % plane_tiles) + num_tiles <= plane_tiles) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Yes); + } else if ((start_tile % plane_tiles) + num_tiles <= plane_tiles) { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::No); + } else { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Never); + } + } else { + // unaligned, only a rectangle if it doesn't advance rows + if (start_tile % d3.num_tiles() + num_tiles <= d3.num_tiles()) { + ASSERTER(tt(start_tile, num_tiles) == 
IsRectangularDomain::Yes); + } else { + ASSERTER(tt(start_tile, num_tiles) == IsRectangularDomain::Never); + } + } + } + } + }; + + SECTION("Shrinking") { + instance_is_rectangular_domain_3d( + Dim64(0, 1, 1), Dim64(0, 0, 1), Dim64(0, 1, 1)); + instance_is_rectangular_domain_3d( + Dim64(0, 1, 1), Dim64(0, 2, 1), Dim64(0, 0, 1)); + } + + rc::prop("any tiles", [&]() { + const Dim64 d1 = + *rc::make_dimension(std::nullopt, {16}); + const Dim64 d2 = + *rc::make_dimension(std::nullopt, {16}); + const Dim64 d3 = + *rc::make_dimension(std::nullopt, {16}); + + instance_is_rectangular_domain_3d.operator()( + d1, d2, d3); + }); +} + +template +std::optional instance_domain_tile_offset( + std::span tile_extents, + const sm::NDRange& domain, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + const IsRectangularDomain expect_rectangle = is_rectangular_domain( + tile_order, tile_extents, domain, start_tile, num_tiles); + const std::optional adjusted_domain = domain_tile_offset( + tile_order, tile_extents, domain, start_tile, num_tiles); + if (expect_rectangle != IsRectangularDomain::Yes) { + ASSERTER(!adjusted_domain.has_value()); + return std::nullopt; + } + + ASSERTER(adjusted_domain.has_value()); + + const uint64_t num_tiles_result = + compute_num_tiles(tile_extents, adjusted_domain.value()); + ASSERTER(num_tiles_result == num_tiles); + + const std::optional start_tile_result = compute_start_tile( + tile_order, tile_extents, domain, adjusted_domain.value()); + ASSERTER(start_tile_result == start_tile); + + return adjusted_domain; +} + +template +void instance_domain_tile_offset( + std::span tile_extents, + const sm::NDRange& domain, + Layout tile_order = Layout::ROW_MAJOR) { + uint64_t total_tiles = 1; + for (uint64_t d = 0; d < tile_extents.size(); d++) { + const uint64_t num_tiles_this_dimension = + sm::Dimension::tile_idx( + domain[d].end_as(), domain[d].start_as(), tile_extents[d]) + + 1; + total_tiles *= num_tiles_this_dimension; + 
} + for (uint64_t start_tile = 0; start_tile < total_tiles; start_tile++) { + for (uint64_t num_tiles = 1; start_tile + num_tiles <= total_tiles; + num_tiles++) { + instance_domain_tile_offset( + tile_extents, domain, start_tile, num_tiles, tile_order); + } + } +} + +template +std::optional::value_type>>> +instance_domain_tile_offset( + const std::vector>& dims, + uint64_t start_tile, + uint64_t num_tiles, + Layout tile_order = Layout::ROW_MAJOR) { + using Coord = typename templates::Dimension
::value_type; + + std::vector tile_extents; + for (const auto& dim : dims) { + tile_extents.push_back(dim.extent); + } + + sm::NDRange domain; + for (const auto& dim : dims) { + domain.push_back(Range(dim.domain.lower_bound, dim.domain.upper_bound)); + } + + const auto range = instance_domain_tile_offset( + tile_extents, domain, start_tile, num_tiles, tile_order); + if (!range.has_value()) { + return std::nullopt; + } + + std::vector> typed_range; + for (const auto& r : range.value()) { + typed_range.emplace_back( + r.template start_as(), r.template end_as()); + } + return typed_range; +} + +template +void instance_domain_tile_offset( + const std::vector>& dims, + Layout tile_order = Layout::ROW_MAJOR) { + using Coord = templates::Dimension
::value_type; + + std::vector tile_extents; + for (const auto& dim : dims) { + tile_extents.push_back(dim.extent); + } + + sm::NDRange domain; + for (const auto& dim : dims) { + domain.push_back(Range(dim.domain.lower_bound, dim.domain.upper_bound)); + } + + instance_domain_tile_offset( + tile_extents, domain, tile_order); +} + +TEST_CASE("domain_tile_offset 1d", "[arithmetic]") { + using Dim64 = templates::Dimension; + + SECTION("Shrinking") { + instance_domain_tile_offset({Dim64(0, 18, 5)}); + } + + rc::prop("any tiles", []() { + const Dim64 d1 = *rc::make_dimension(std::nullopt, {128}); + + instance_domain_tile_offset({d1}); + }); +} + +TEST_CASE("domain_tile_offset 2d", "[arithmetic]") { + using Dim64 = templates::Dimension; + using Dom64 = Dim64::domain_type; + + SECTION("Rectangle examples") { + const uint64_t d1_lower_bound = GENERATE(0, 3); + const uint64_t d1_extent = GENERATE(1, 4); + const uint64_t d2_lower_bound = GENERATE(0, 3); + const uint64_t d2_extent = GENERATE(1, 4); + + const Dim64 d1( + d1_lower_bound, d1_lower_bound + (5 * d1_extent) - 1, d1_extent); + const Dim64 d2( + d2_lower_bound, d2_lower_bound + (4 * d2_extent) - 1, d2_extent); + + auto make_d1 = [&](uint64_t r_start, uint64_t r_end) { + return Dom64( + d1_lower_bound + r_start * d1_extent, + d1_lower_bound + r_end * d1_extent + d1_extent - 1); + }; + auto make_d2 = [&](uint64_t c_start, uint64_t c_end) { + return Dom64( + d2_lower_bound + c_start * d2_extent, + d2_lower_bound + c_end * d2_extent + d2_extent - 1); + }; + + SECTION("Whole domain") { + const Layout tile_order = GENERATE(Layout::ROW_MAJOR, Layout::COL_MAJOR); + const auto r = instance_domain_tile_offset( + {d1, d2}, 0, 20, tile_order); + CHECK(r == std::vector{d1.domain, d2.domain}); + } + + SECTION("Sub-rectangle") { + const auto r1 = + instance_domain_tile_offset({d1, d2}, 4, 8); + CHECK(r1 == std::vector{make_d1(1, 2), d2.domain}); + + const auto r2 = + instance_domain_tile_offset({d1, d2}, 8, 4); + CHECK(r2 == 
std::vector{make_d1(2, 2), d2.domain}); + + const auto r3 = + instance_domain_tile_offset({d1, d2}, 8, 12); + CHECK(r3 == std::vector{make_d1(2, 4), d2.domain}); + } + + SECTION("Line") { + const auto r1 = + instance_domain_tile_offset({d1, d2}, 0, 2); + CHECK(r1 == std::vector{make_d1(0, 0), make_d2(0, 1)}); + + const auto r2 = + instance_domain_tile_offset({d1, d2}, 1, 2); + CHECK( + r2 == std::vector{ + make_d1(0, 0), + make_d2(1, 2), + }); + + const auto r3 = + instance_domain_tile_offset({d1, d2}, 9, 3); + CHECK(r3 == std::vector{make_d1(2, 2), make_d2(1, 3)}); + } + + SECTION("Align start but not end") { + const auto r1 = + instance_domain_tile_offset({d1, d2}, 0, 5); + CHECK(r1 == std::optional>{}); + + const auto r2 = + instance_domain_tile_offset({d1, d2}, 4, 11); + CHECK(r2 == std::optional>{}); + } + + SECTION("Cross row") { + const auto r1 = + instance_domain_tile_offset({d1, d2}, 7, 2); + CHECK(r1 == std::optional>{}); + + const auto r2 = + instance_domain_tile_offset({d1, d2}, 5, 4); + CHECK(r2 == std::optional>{}); + + const auto r3 = + instance_domain_tile_offset({d1, d2}, 5, 8); + CHECK(r3 == std::optional>{}); + } + + SECTION("Column major") { + const auto r1 = instance_domain_tile_offset( + {d1, d2}, 0, 10, Layout::COL_MAJOR); + CHECK(r1 == std::vector{d1.domain, make_d2(0, 1)}); + + const auto r2 = instance_domain_tile_offset( + {d1, d2}, 11, 4, Layout::COL_MAJOR); + CHECK(r2 == std::vector{make_d1(1, 4), make_d2(2, 2)}); + + const auto r3 = instance_domain_tile_offset( + {d1, d2}, 11, 5, Layout::COL_MAJOR); + CHECK(r3 == std::optional>{}); + } + } + + SECTION("CORE-290 Example") { + const Dim64 row(0, std::numeric_limits::max() - 1, 4); + const Dim64 col(0, 99999, 100000 / row.extent); + + auto make_row = [&](uint64_t r_start, uint64_t r_end) { + return Dom64( + row.domain.lower_bound + r_start * row.extent, + row.domain.lower_bound + r_end * row.extent + row.extent - 1); + }; + + const auto r1 = instance_domain_tile_offset( + {row, col}, 0, 4, 
Layout::ROW_MAJOR); + CHECK(r1 == std::vector{make_row(0, 0), col.domain}); + } + + SECTION("Hyperrow overflow") { + const uint64_t target_tiles_in_domain = 1 << 16; + const uint64_t lower_bound = 0; + const uint64_t upper_bound = std::numeric_limits::max() - 1; + const uint64_t extent = + (upper_bound - lower_bound + 1) / target_tiles_in_domain; + const Dim64 d(lower_bound, upper_bound, extent); + + SECTION("Not overflow") { + const auto r = instance_domain_tile_offset( + {d, d, d, d}, 0, 1, Layout::ROW_MAJOR); + CHECK( + r == std::vector{ + Dom64(0, extent - 1), + Dom64(0, extent - 1), + Dom64(0, extent - 1), + Dom64(0, extent - 1)}); + } + + SECTION("Overflow") { + const auto expect = Catch::Matchers::ContainsSubstring( + "Fragment size is too small to subdivide dense subarray into " + "multiple fragments"); + REQUIRE_THROWS( + instance_domain_tile_offset( + {d, d, d, d, d}, 0, 1, Layout::ROW_MAJOR), + expect); + } + } + + rc::prop("any tiles", []() { + const Dim64 d1 = *rc::make_dimension(std::nullopt, {64}); + const Dim64 d2 = *rc::make_dimension(std::nullopt, {64}); + const Layout tile_order = + *rc::gen::element(Layout::ROW_MAJOR, Layout::COL_MAJOR); + + instance_domain_tile_offset( + {d1, d2}, tile_order); + }); +} + +TEST_CASE("domain_tile_offset 3d", "[arithmetic]") { + using Dim64 = templates::Dimension; + using Dom64 = Dim64::domain_type; + + SECTION("Rectangular prism examples") { + const uint64_t d1_lower_bound = GENERATE(0, 3); + const uint64_t d1_extent = GENERATE(1, 4); + const uint64_t d2_lower_bound = GENERATE(0, 3); + const uint64_t d2_extent = GENERATE(1, 4); + const uint64_t d3_lower_bound = GENERATE(0, 3); + const uint64_t d3_extent = GENERATE(1, 4); + + const Dim64 d1( + d1_lower_bound, d1_lower_bound + (3 * d1_extent) - 1, d1_extent); + const Dim64 d2( + d2_lower_bound, d2_lower_bound + (6 * d2_extent) - 1, d2_extent); + const Dim64 d3( + d3_lower_bound, d3_lower_bound + (7 * d3_extent) - 1, d3_extent); + + auto make_d1 = [&](uint64_t 
h_start, uint64_t h_end) { + return Dom64( + d1_lower_bound + h_start * d1_extent, + d1_lower_bound + h_end * d1_extent + d1_extent - 1); + }; + auto make_d2 = [&](uint64_t w_start, uint64_t w_end) { + return Dom64( + d2_lower_bound + w_start * d2_extent, + d2_lower_bound + w_end * d2_extent + d2_extent - 1); + }; + auto make_d3 = [&](uint64_t l_start, uint64_t l_end) { + return Dom64( + d3_lower_bound + l_start * d3_extent, + d3_lower_bound + l_end * d3_extent + d3_extent - 1); + }; + + SECTION("Whole domain") { + const Layout tile_order = GENERATE(Layout::ROW_MAJOR, Layout::COL_MAJOR); + const auto r = instance_domain_tile_offset( + {d1, d2, d3}, + 0, + d1.num_tiles() * d2.num_tiles() * d3.num_tiles(), + tile_order); + CHECK(r == std::vector{d1.domain, d2.domain, d3.domain}); + } + + SECTION("Plane") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 0, 42); + CHECK(r1 == std::vector{make_d1(0, 0), d2.domain, d3.domain}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 42, 42); + CHECK(r2 == std::vector{make_d1(1, 1), d2.domain, d3.domain}); + + const auto r3 = + instance_domain_tile_offset({d1, d2, d3}, 84, 42); + CHECK(r3 == std::vector{make_d1(2, 2), d2.domain, d3.domain}); + } + + SECTION("Rectangle") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 0, 14); + CHECK(r1 == std::vector{make_d1(0, 0), make_d2(0, 1), d3.domain}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 70, 14); + CHECK(r2 == std::vector{make_d1(1, 1), make_d2(4, 5), d3.domain}); + } + + SECTION("Line") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 0, 4); + CHECK( + r1 == + std::vector{make_d1(0, 0), make_d2(0, 0), make_d3(0, 3)}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 8, 2); + CHECK( + r2 == + std::vector{make_d1(0, 0), make_d2(1, 1), make_d3(1, 2)}); + + const auto r3 = + instance_domain_tile_offset({d1, d2, d3}, 109, 3); + CHECK( + r3 == + std::vector{make_d1(2, 2), make_d2(3, 3), 
make_d3(4, 6)}); + } + + SECTION("Align start but not end") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 0, 43); + CHECK(r1 == std::optional>{}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 42, 125); + CHECK(r2 == std::optional>{}); + } + + SECTION("Cross row") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 0, 8); + CHECK(r1 == std::optional>{}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 23, 6); + CHECK(r2 == std::optional>{}); + } + + SECTION("Cross plane") { + const auto r1 = + instance_domain_tile_offset({d1, d2, d3}, 40, 3); + CHECK(r1 == std::optional>{}); + + const auto r2 = + instance_domain_tile_offset({d1, d2, d3}, 77, 8); + CHECK(r2 == std::optional>{}); + } + + SECTION("Column major") { + const auto r1 = instance_domain_tile_offset( + {d1, d2, d3}, 54, 36, Layout::COL_MAJOR); + CHECK(r1 == std::vector{d1.domain, d2.domain, make_d3(3, 4)}); + + const auto r2 = instance_domain_tile_offset( + {d1, d2, d3}, 78, 12, Layout::COL_MAJOR); + CHECK(r2 == std::vector{d1.domain, make_d2(2, 5), make_d3(4, 4)}); + } + } + + rc::prop("any tiles", []() { + const Dim64 d1 = + *rc::make_dimension(std::nullopt, {16}); + const Dim64 d2 = + *rc::make_dimension(std::nullopt, {16}); + const Dim64 d3 = + *rc::make_dimension(std::nullopt, {16}); + const Layout tile_order = + *rc::gen::element(Layout::ROW_MAJOR, Layout::COL_MAJOR); + + instance_domain_tile_offset( + {d1, d2, d3}, tile_order); + }); +} diff --git a/tiledb/sm/tile/tile.h b/tiledb/sm/tile/tile.h index 3d730f86a20..d82382d9da2 100644 --- a/tiledb/sm/tile/tile.h +++ b/tiledb/sm/tile/tile.h @@ -458,6 +458,13 @@ class WriterTile : public TileBase { return filtered_buffer_; } + /** + * Returns the buffer that contains the filtered, on-disk format. + */ + inline const FilteredBuffer& filtered_buffer() const { + return filtered_buffer_; + } + /** * Write method used for var data. Resizes the internal buffer if needed. 
* diff --git a/tiledb/sm/tile/writer_tile_tuple.cc b/tiledb/sm/tile/writer_tile_tuple.cc index 9ce07d20f95..e6e823ec153 100644 --- a/tiledb/sm/tile/writer_tile_tuple.cc +++ b/tiledb/sm/tile/writer_tile_tuple.cc @@ -114,5 +114,20 @@ void WriterTileTuple::set_metadata( } } +/** Sums `filtered_buffer().size()` over this tuple's tiles: the offset and var tiles when `var_size()`, otherwise the fixed tile, plus the validity tile when `nullable()`. NOTE(review): every path below returns a value; `std::nullopt` is never produced here. */ std::optional WriterTileTuple::filtered_size() const { + uint64_t tile_size = 0; + if (var_size()) { + tile_size += offset_tile().filtered_buffer().size(); + tile_size += var_tile().filtered_buffer().size(); + } else { + tile_size += fixed_tile().filtered_buffer().size(); + } + + if (nullable()) { + tile_size += validity_tile().filtered_buffer().size(); + } + return tile_size; +} + } // namespace sm } // namespace tiledb diff --git a/tiledb/sm/tile/writer_tile_tuple.h b/tiledb/sm/tile/writer_tile_tuple.h index 8a2ca28938b..37339ecc836 100644 --- a/tiledb/sm/tile/writer_tile_tuple.h +++ b/tiledb/sm/tile/writer_tile_tuple.h @@ -212,6 +212,12 @@ class WriterTileTuple { return cell_num_; } + /** + * @return the total size of the filtered tiles, or `std::nullopt` if not + * filtered. NOTE(review): the definition added to writer_tile_tuple.cc by this patch returns a value on every path; confirm when `std::nullopt` is intended. + */ + std::optional filtered_size() const; + private: /* ********************************* */ /* PRIVATE ATTRIBUTES */ diff --git a/tiledb/type/range/range.h b/tiledb/type/range/range.h index 3ea85383a2e..89a65d7ff0a 100644 --- a/tiledb/type/range/range.h +++ b/tiledb/type/range/range.h @@ -291,6 +291,12 @@ class Range { return range_.data(); } + /** Non-const overload: returns `range_.data()` as a writable pointer to the start bytes of a fixed-size range, after `iassert` checks that the range is not var-sized and `range_` is not empty. */ inline void* start_fixed() { + iassert(!var_size_); + iassert(range_.size() != 0); + return range_.data(); + } + /** Copies 'start' into this range's start bytes for fixed-size ranges. */ void set_start_fixed(const void* const start) { if (var_size_) { @@ -354,6 +360,13 @@ class Range { return &range_[end_pos]; } + /** Non-const overload: returns a writable pointer to the end bytes of a fixed-size range, which begin at offset `range_.size() / 2`, after `iassert` checks that the range is not var-sized and `range_` is not empty. */ void* end_fixed() { + iassert(!var_size_); + iassert(range_.size() != 0); + auto end_pos = range_.size() / 2; + return &range_[end_pos]; + } + /** Copies 'end' into this range's end bytes for fixed-size ranges.
*/ void set_end_fixed(const void* const end) { if (var_size_) {