Skip to content

Commit 3c7a4bb

Browse files
authored
[9898173277] Resolve row range edge cases for arrow (#2660)
#### Reference Issues/PRs Fixes monday ref 9898173277. Fixes monday ref 9943908001 when using `query_builder`'s `date_range`. #### What does this implement or fix? - Small refactor to use abstract common `is_slice_in_row_range` and `is_slice_in_index_range` to be used both in regular reads and in the processing pipeline. This resolves the issue fixed in #2632 for query builder date ranges as well. - If the requested row/date range is empty, we no longer read any data keys (previously we would read one) - Removes several unused methods from `query.hpp` and `index_range.hpp` (the index_range one was quite broken) - Adds arrow tests for: - open-ended row/date range filters - empty row/date ranges - negative row ranges #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent 64085ab commit 3c7a4bb

File tree

9 files changed

+236
-110
lines changed

9 files changed

+236
-110
lines changed

cpp/arcticdb/entity/index_range.hpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ struct IndexRange {
9090
return left.start_ <= right.end_ && left.end_ >= right.start_;
9191
}
9292

93+
friend bool closed_aware_intersects(const IndexRange& left, const IndexRange& right) {
94+
return left.inclusive_start() <= right.inclusive_end() && left.inclusive_end() >= right.inclusive_start();
95+
}
96+
9397
friend bool intersects(const IndexRange& rg, const IndexValue& start, const IndexValue& end) {
9498
if (!rg.specified_)
9599
return true;
@@ -104,24 +108,21 @@ struct IndexRange {
104108
return left.start_ == right.start_ && left.end_ == right.end_;
105109
}
106110

107-
void adjust_open_closed_interval() {
108-
if (std::holds_alternative<NumericIndex>(start_) && !start_closed_) {
109-
auto start = std::get<NumericIndex>(start_);
110-
start_ = NumericIndex(start + 1);
111-
}
112-
113-
if (std::holds_alternative<NumericIndex>(end_) && !end_closed_) {
114-
auto end = std::get<NumericIndex>(end_);
115-
end_ = NumericIndex(end - 1);
116-
}
117-
}
118-
119111
IndexValue inclusive_end() const {
120112
if (std::holds_alternative<NumericIndex>(end_) && !end_closed_) {
121113
return NumericIndex(std::get<NumericIndex>(end_) - 1);
122114
}
123115
return end_;
124116
}
117+
118+
IndexValue inclusive_start() const {
119+
if (std::holds_alternative<NumericIndex>(start_) && !start_closed_) {
120+
return NumericIndex(std::get<NumericIndex>(start_) + 1);
121+
}
122+
return start_;
123+
}
124+
125+
bool empty() const { return inclusive_start() > inclusive_end(); }
125126
};
126127

127128
inline IndexRange unspecified_range() { return {}; }

cpp/arcticdb/pipeline/frame_slice.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ struct AxisRange : std::pair<size_t, size_t> {
3030

3131
[[nodiscard]] size_t end() const { return second; }
3232

33+
[[nodiscard]] bool empty() const { return first >= second; }
34+
3335
struct Hasher {
3436
template<class T>
3537
std::enable_if_t<std::is_base_of_v<AxisRange, std::decay_t<T>>, std::size_t> operator()(const T& r) const {
@@ -312,4 +314,17 @@ struct formatter<arcticdb::pipelines::SliceAndKey> {
312314
}
313315
};
314316

317+
template<>
318+
struct formatter<arcticdb::pipelines::RangesAndKey> {
319+
template<typename ParseContext>
320+
constexpr auto parse(ParseContext& ctx) {
321+
return ctx.begin();
322+
}
323+
324+
template<typename FormatContext>
325+
auto format(arcticdb::pipelines::RangesAndKey sk, FormatContext& ctx) const {
326+
return fmt::format_to(ctx.out(), "{},{},{},{}", sk.row_range(), sk.col_range(), sk.key_, sk.is_incomplete());
327+
}
328+
};
329+
315330
} // namespace fmt

cpp/arcticdb/pipeline/query.cpp

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,31 @@ namespace arcticdb::pipelines {
1616
using namespace arcticdb::stream;
1717
using namespace arcticdb::pipelines::index;
1818

19-
IndexValue start_index(const std::vector<SliceAndKey>& sk, std::size_t row) { return sk[row].key().start_index(); }
20-
21-
IndexValue start_index(const index::IndexSegmentReader& isr, std::size_t row) {
22-
return index::index_value_from_segment(isr.seg(), row, index::Fields::start_index);
19+
RowRange slice_row_range_at(const IndexSegmentReader& isr, std::size_t row) {
20+
auto start_row = isr.column(index::Fields::start_row).scalar_at<std::size_t>(row).value();
21+
auto end_row = isr.column(index::Fields::end_row).scalar_at<std::size_t>(row).value();
22+
return {start_row, end_row};
2323
}
2424

25-
IndexValue end_index(const index::IndexSegmentReader& isr, std::size_t row) {
26-
return index::index_value_from_segment(isr.seg(), row, index::Fields::end_index);
25+
RowRange slice_row_range_at(const std::vector<SliceAndKey>& sk, std::size_t row) { return sk[row].slice_.row_range; }
26+
27+
bool is_slice_in_row_range(const RowRange& slice_row_range, const RowRange& row_filter) {
28+
// If the row_filter is empty we return false (i.e. don't read the slice).
29+
// This is required for cases like a range (3, 2) which falls completely within an index row (0, 10). We
30+
// know that if the range is empty we don't need to read the data key for (0, 10) because it won't contain
31+
// any elements within the empty range.
32+
return slice_row_range.first < row_filter.second && slice_row_range.second > row_filter.first &&
33+
!row_filter.empty();
2734
}
2835

29-
IndexValue end_index(const std::vector<SliceAndKey>& sk, std::size_t row) { return sk[row].key().end_index(); }
36+
bool is_slice_in_index_range(IndexRange slice_index_range, const IndexRange& index_filter, bool is_read_operation) {
37+
// Typically slice_index_range should be end exclusive, however due to old bugs we have old data written with
38+
// inclusive end_index. So, when we are reading we explicitly set the interval as closed to be able to read
39+
// old_data. The same fix should be done for updates, but that is not implemented yet and should be added with
40+
// https://github.com/man-group/ArcticDB/issues/2655
41+
slice_index_range.end_closed_ = is_read_operation;
42+
return closed_aware_intersects(slice_index_range, index_filter) && !index_filter.empty();
43+
}
3044

3145
template<typename ContainerType, typename IdxType>
3246
std::unique_ptr<util::BitSet> build_bitset_for_index(
@@ -92,16 +106,9 @@ std::unique_ptr<util::BitSet> build_bitset_for_index(
92106
auto start_idx_pos = start_idx_col.template begin<IndexTagType>();
93107
auto end_idx_pos = end_idx_col.template begin<IndexTagType>();
94108

95-
using RawType = typename IndexTagType::DataTypeTag::raw_type;
96-
const auto range_start = std::get<timestamp>(rg.start_);
97-
const auto range_end = std::get<timestamp>(rg.end_);
98109
for (auto i = 0u; i < container.size(); ++i) {
99-
// If we are reading, we want to include the the end index, in order to support backwards compatibility with
100-
// older versions. The same fix should be done for updates, but that is not implemented yet and should be
101-
// added with https://github.com/man-group/ArcticDB/issues/2655
102-
const auto adjusted_end_idx_pos = is_read_operation ? *end_idx_pos : *end_idx_pos - 1;
103110
const auto intersects =
104-
range_intersects<RawType>(range_start, range_end, *start_idx_pos, adjusted_end_idx_pos);
111+
is_slice_in_index_range(IndexRange(*start_idx_pos, *end_idx_pos), rg, is_read_operation);
105112
(*res)[i] = intersects;
106113
if (intersects)
107114
ARCTICDB_DEBUG(log::version(), "range intersects at {}", i);

cpp/arcticdb/pipeline/query.hpp

Lines changed: 5 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -160,28 +160,15 @@ inline FilterQuery<index::IndexSegmentReader> create_dynamic_col_filter(
160160
};
161161
}
162162

163-
inline std::size_t start_row(const index::IndexSegmentReader& isr, std::size_t row) {
164-
return isr.column(index::Fields::start_row).scalar_at<std::size_t>(row).value();
165-
}
166-
167-
inline std::size_t start_row(const std::vector<SliceAndKey>& sk, std::size_t row) {
168-
return sk[row].slice_.row_range.first;
169-
}
170-
171-
inline std::size_t end_row(const index::IndexSegmentReader& isr, std::size_t row) {
172-
return isr.column(index::Fields::end_row).scalar_at<std::size_t>(row).value();
173-
}
174-
175-
inline std::size_t end_row(const std::vector<SliceAndKey>& sk, std::size_t row) {
176-
return sk[row].slice_.row_range.second;
177-
}
163+
RowRange slice_row_range_at(const std::vector<SliceAndKey>& sk, std::size_t row);
164+
RowRange slice_row_range_at(const index::IndexSegmentReader& isr, std::size_t row);
178165

179166
template<typename ContainerType>
180167
inline FilterQuery<ContainerType> create_row_filter(RowRange&& range) {
181168
return [rg = std::move(range)](const ContainerType& container, std::unique_ptr<util::BitSet>&& input) mutable {
182169
auto res = std::make_unique<util::BitSet>(static_cast<util::BitSetSizeType>(container.size()));
183170
for (std::size_t r = 0, end = container.size(); r < end; ++r) {
184-
bool included = start_row(container, r) < rg.second && end_row(container, r) > rg.first;
171+
bool included = is_slice_in_row_range(slice_row_range_at(container, r), rg);
185172
ARCTICDB_DEBUG(log::version(), "Row {} is {} range {}", r, included ? "inside" : "outside", rg);
186173
(*res)[r] = included;
187174
}
@@ -194,18 +181,8 @@ inline FilterQuery<ContainerType> create_row_filter(RowRange&& range) {
194181
};
195182
}
196183

197-
IndexValue start_index(const std::vector<SliceAndKey>& sk, std::size_t row);
198-
199-
IndexValue start_index(const index::IndexSegmentReader& isr, std::size_t row);
200-
201-
IndexValue end_index(const index::IndexSegmentReader& isr, std::size_t row);
202-
203-
IndexValue end_index(const std::vector<SliceAndKey>& sk, std::size_t row);
204-
205-
template<typename RawType>
206-
bool range_intersects(RawType a_start, RawType a_end, RawType b_start, RawType b_end) {
207-
return a_start <= b_end && a_end >= b_start;
208-
}
184+
bool is_slice_in_row_range(const RowRange& slice_row_range, const RowRange& row_filter);
185+
bool is_slice_in_index_range(IndexRange slice_index_range, const IndexRange& index_filter, bool is_read_operation);
209186

210187
template<typename ContainerType, typename IdxType>
211188
std::unique_ptr<util::BitSet> build_bitset_for_index(

cpp/arcticdb/pipeline/read_frame.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,15 @@ SegmentInMemory allocate_chunked_frame(const std::shared_ptr<PipelineContext>& c
107107
};
108108
auto handlers = TypeHandlerRegistry::instance();
109109

110-
for (auto& column : output.columns()) {
111-
auto handler = handlers->get_handler(output_format, column->type());
112-
const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
113-
for (auto block_row_count : block_row_counts) {
114-
const auto bytes = block_row_count * data_size;
115-
column->allocate_data(bytes);
116-
column->advance_data(bytes);
110+
if (row_count > 0) {
111+
for (auto& column : output.columns()) {
112+
auto handler = handlers->get_handler(output_format, column->type());
113+
const auto data_size = data_type_size(column->type(), output_format, DataTypeMode::EXTERNAL);
114+
for (auto block_row_count : block_row_counts) {
115+
const auto bytes = block_row_count * data_size;
116+
column->allocate_data(bytes);
117+
column->advance_data(bytes);
118+
}
117119
}
118120
}
119121

cpp/arcticdb/processing/clause.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <arcticdb/processing/clause.hpp>
1818
#include <arcticdb/pipeline/column_stats.hpp>
1919
#include <arcticdb/pipeline/frame_slice.hpp>
20+
#include <arcticdb/pipeline/query.hpp>
2021
#include <arcticdb/util/test/random_throw.hpp>
2122
#include <ankerl/unordered_dense.h>
2223
#include <arcticdb/util/movable_priority_queue.hpp>
@@ -1376,12 +1377,13 @@ std::vector<EntityId> ColumnStatsGenerationClause::process(std::vector<EntityId>
13761377
}
13771378

13781379
std::vector<std::vector<size_t>> RowRangeClause::structure_for_processing(std::vector<RangesAndKey>& ranges_and_keys) {
1380+
auto row_range_filter = RowRange{start_, end_};
13791381
ranges_and_keys.erase(
13801382
std::remove_if(
13811383
ranges_and_keys.begin(),
13821384
ranges_and_keys.end(),
1383-
[this](const RangesAndKey& ranges_and_key) {
1384-
return ranges_and_key.row_range_.start() >= end_ || ranges_and_key.row_range_.end() <= start_;
1385+
[&](const RangesAndKey& ranges_and_key) {
1386+
return !is_slice_in_row_range(ranges_and_key.row_range(), row_range_filter);
13851387
}
13861388
),
13871389
ranges_and_keys.end()
@@ -1532,13 +1534,14 @@ std::vector<std::vector<size_t>> DateRangeClause::structure_for_processing(std::
15321534
processing_config_.index_type_ == IndexDescriptor::Type::TIMESTAMP,
15331535
"Cannot use date range with non-timestamp indexed data"
15341536
);
1537+
auto index_filter = IndexRange(start_, end_);
15351538
ranges_and_keys.erase(
15361539
std::remove_if(
15371540
ranges_and_keys.begin(),
15381541
ranges_and_keys.end(),
1539-
[this](const RangesAndKey& ranges_and_key) {
1540-
auto [start_index, end_index] = ranges_and_key.key_.time_range();
1541-
return start_index > end_ || end_index <= start_;
1542+
[&](const RangesAndKey& ranges_and_key) {
1543+
auto slice_index_range = IndexRange(ranges_and_key.key_.time_range());
1544+
return !is_slice_in_index_range(slice_index_range, index_filter, true);
15421545
}
15431546
),
15441547
ranges_and_keys.end()

python/tests/compat/arcticdb/test_compatibility.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -434,11 +434,14 @@ def test_compat_update_old_updated_data(pandas_v1_venv, s3_ssl_disabled_storage,
434434
(None, pd.Timestamp("2025-01-03 10:00:00")), # Intersects with problematic range at end
435435
],
436436
)
437-
def test_compat_arrow_range_old_updated_data(pandas_v1_venv, s3_ssl_disabled_storage, lib_name, date_range):
437+
@pytest.mark.parametrize("use_query_builder", [True, False])
438+
def test_compat_date_range_old_updated_data(
439+
pandas_v1_venv, s3_ssl_disabled_storage, lib_name, date_range, use_query_builder, any_output_format
440+
):
438441
# There was a bug where data written using update and old versions of ArcticDB produced data keys where the
439442
# end_index value was not 1 nanosecond larger than the last index value in the segment (as it should be), but
440443
# instead contained the start of the date_range passed into the update call.
441-
# We want to verify C++ truncation within arrow works with the old broken end index values.
444+
# We want to verify reading date range of the old broken end index values works.
442445
arctic_uri = s3_ssl_disabled_storage.arctic_uri
443446
with CompatLibrary(pandas_v1_venv, arctic_uri, lib_name) as compat:
444447
sym = "sym"
@@ -456,6 +459,11 @@ def test_compat_arrow_range_old_updated_data(pandas_v1_venv, s3_ssl_disabled_sto
456459
compat.old_lib.update(sym, df_1, '(pd.Timestamp("2025-01-03 00:00:00"), None)')
457460
compat.old_lib.update(sym, df_2, '(pd.Timestamp("2025-01-04 00:00:00"), None)')
458461

462+
expected_df = pd.concat([df_0.iloc[:1], df_1.iloc[:1], df_2])
463+
filter_after_start = expected_df.index >= date_range[0] if date_range[0] else True
464+
filter_before_end = expected_df.index <= date_range[1] if date_range[1] else True
465+
expected_df = expected_df[filter_after_start & filter_before_end]
466+
459467
# Resample using current version
460468
with compat.current_version() as curr:
461469
index_df = curr.lib._nvs.read_index(sym)
@@ -465,9 +473,12 @@ def test_compat_arrow_range_old_updated_data(pandas_v1_venv, s3_ssl_disabled_sto
465473
assert index_df["end_index"].iloc[1] == pd.Timestamp("2025-01-04 00:00:00")
466474
assert index_df["end_index"].iloc[2] == pd.Timestamp("2025-01-05 23:00:00") + pd.Timedelta(1, unit="ns")
467475

468-
arrow_table = curr.lib.read(sym, date_range=date_range, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
469-
expected_df = curr.lib.read(sym, date_range=date_range).data
470-
assert_frame_equal_with_arrow(arrow_table, expected_df)
476+
if use_query_builder:
477+
q = QueryBuilder().date_range(date_range)
478+
result = curr.lib.read(sym, query_builder=q, output_format=any_output_format).data
479+
else:
480+
result = curr.lib.read(sym, date_range=date_range, output_format=any_output_format).data
481+
assert_frame_equal_with_arrow(result, expected_df)
471482

472483

473484
def test_norm_meta_column_and_index_names_write_old_read_new(old_venv_and_arctic_uri, lib_name):

python/tests/integration/arcticdb/version_store/test_num_storage_operations.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
config_context,
1313
config_context_string,
1414
)
15+
from arcticdb.version_store.processing import QueryBuilder
1516
import arcticdb.toolbox.query_stats as qs
1617

1718

@@ -281,6 +282,9 @@ def get_update_dataframe(start_index, end_index):
281282

282283

283284
def get_num_data_keys_intersecting_row_range(index, start, end):
285+
if end <= start:
286+
# We shouldn't read any data keys if row range is empty
287+
return 0
284288
count = 0
285289
for index, row in index.iterrows():
286290
if (start is None or start < row["end_row"]) and (end is None or end > row["start_row"]):
@@ -289,6 +293,9 @@ def get_num_data_keys_intersecting_row_range(index, start, end):
289293

290294

291295
def get_num_data_keys_intersecting_date_range(index, start, end, exclude_fully_included=False):
296+
if start is not None and end is not None and end < start:
297+
# We shouldn't read any data keys if date range is empty
298+
return 0
292299
count = 0
293300
for i, (_, row) in enumerate(index.reset_index().iterrows()):
294301
# end is inclusive when doing date_range but end_index in the column is exclusive
@@ -325,8 +332,11 @@ def get_num_data_keys_intersecting_date_range(index, start, end, exclude_fully_i
325332
@pytest.mark.parametrize(
326333
"row_range_start, row_range_end", [(0, 0), (5, 5), (0, 1), (1, 2), (5, 6), (0, 4), (1, 5), (0, 6), (6, 15), (0, 15)]
327334
)
335+
@pytest.mark.parametrize("use_query_builder", [True, False])
328336
@pytest.mark.parametrize("dynamic_schema", [True, False])
329-
def test_row_range_num_reads(s3_store_factory, clear_query_stats, dynamic_schema, row_range_start, row_range_end):
337+
def test_row_range_num_reads(
338+
s3_store_factory, clear_query_stats, dynamic_schema, row_range_start, row_range_end, use_query_builder
339+
):
330340
with config_context("VersionMap.ReloadInterval", 0):
331341
lib = s3_store_factory(column_group_size=2, segment_row_size=2, dynamic_schema=dynamic_schema)
332342
qs.enable()
@@ -343,7 +353,12 @@ def test_row_range_num_reads(s3_store_factory, clear_query_stats, dynamic_schema
343353
assert sum_operations_by_type(stats, "S3_GetObject") == 2
344354

345355
expected_df = df.iloc[row_range_start:row_range_end]
346-
result_df = lib.read(sym, row_range=(row_range_start, row_range_end)).data
356+
row_range = (row_range_start, row_range_end)
357+
if use_query_builder:
358+
q = QueryBuilder().row_range(row_range)
359+
result_df = lib.read(sym, query_builder=q).data
360+
else:
361+
result_df = lib.read(sym, row_range=row_range).data
347362
stats = qs.get_query_stats()
348363
qs.reset_stats()
349364
assert_frame_equal(result_df, expected_df)
@@ -353,8 +368,11 @@ def test_row_range_num_reads(s3_store_factory, clear_query_stats, dynamic_schema
353368

354369

355370
@pytest.mark.parametrize("date_range_start, date_range_end", date_ranges_to_test)
371+
@pytest.mark.parametrize("use_query_builder", [True, False])
356372
@pytest.mark.parametrize("dynamic_schema", [True, False])
357-
def test_date_range_num_reads(s3_store_factory, clear_query_stats, dynamic_schema, date_range_start, date_range_end):
373+
def test_date_range_num_reads(
374+
s3_store_factory, clear_query_stats, dynamic_schema, date_range_start, date_range_end, use_query_builder
375+
):
358376
with config_context("VersionMap.ReloadInterval", 0):
359377
lib = s3_store_factory(column_group_size=2, segment_row_size=2, dynamic_schema=dynamic_schema)
360378
qs.enable()
@@ -373,7 +391,12 @@ def test_date_range_num_reads(s3_store_factory, clear_query_stats, dynamic_schem
373391
assert sum_operations_by_type(stats, "S3_GetObject") == 2
374392

375393
expected_df = df.loc[date_range_start:date_range_end]
376-
result_df = lib.read(sym, date_range=(date_range_start, date_range_end)).data
394+
date_range = (date_range_start, date_range_end)
395+
if use_query_builder:
396+
q = QueryBuilder().date_range(date_range)
397+
result_df = lib.read(sym, query_builder=q).data
398+
else:
399+
result_df = lib.read(sym, date_range=date_range).data
377400
stats = qs.get_query_stats()
378401
qs.reset_stats()
379402
assert_frame_equal(result_df, expected_df)

0 commit comments

Comments
 (0)