Skip to content

Commit e14ef98

Browse files
authored
Raise exception if filtering recursively normalized data or numpy array (#2641)
#### Reference Issues/PRs <!--Example: Fixes #1234. See also #3456.--> https://man312219.monday.com/boards/7852509418/pulses/9927290197 https://man312219.monday.com/boards/7852509418/pulses/9921000181 #### What does this implement or fix? Raise exception for reading recursively normalized data or numpy array if query_builder is passed, as query_builder doesn't work on such data anyway #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent a7e6cfe commit e14ef98

File tree

6 files changed

+85
-28
lines changed

6 files changed

+85
-28
lines changed

cpp/arcticdb/pipeline/index_segment_reader.cpp

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -91,26 +91,4 @@ bool IndexSegmentReader::is_pickled() const {
9191

9292
bool IndexSegmentReader::has_timestamp_index() const { return tsd().index().type_ == IndexDescriptor::Type::TIMESTAMP; }
9393

94-
void check_column_and_date_range_filterable(
95-
const pipelines::index::IndexSegmentReader& index_segment_reader, const ReadQuery& read_query
96-
) {
97-
util::check(
98-
!index_segment_reader.is_pickled() ||
99-
(!read_query.columns.has_value() && std::holds_alternative<std::monostate>(read_query.row_filter)),
100-
"The data for this symbol is pickled and does not support column stats, date_range, row_range, or column "
101-
"queries"
102-
);
103-
util::check(
104-
index_segment_reader.has_timestamp_index() || !std::holds_alternative<IndexRange>(read_query.row_filter),
105-
"Cannot apply date range filter to symbol with non-timestamp index"
106-
);
107-
sorting::check<ErrorCode::E_UNSORTED_DATA>(
108-
index_segment_reader.sorted() == SortedValue::UNKNOWN ||
109-
index_segment_reader.sorted() == SortedValue::ASCENDING ||
110-
!std::holds_alternative<IndexRange>(read_query.row_filter),
111-
"When filtering data using date_range, the symbol must be sorted in ascending order. ArcticDB believes it "
112-
"is not sorted in ascending order and cannot therefore filter the data using date_range."
113-
);
114-
}
115-
11694
} // namespace arcticdb::pipelines::index

cpp/arcticdb/pipeline/index_segment_reader.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,4 @@ folly::Future<IndexSegmentReader> async_get_index_reader(
122122

123123
IndexRange get_index_segment_range(const AtomKey& prev_index, const std::shared_ptr<Store>& store);
124124

125-
void check_column_and_date_range_filterable(
126-
const IndexSegmentReader& index_segment_reader, const ReadQuery& read_query
127-
);
128-
129125
} // namespace arcticdb::pipelines::index

cpp/arcticdb/util/error_code.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ inline std::unordered_map<ErrorCategory, const char*> get_error_category_names()
7474
ERROR_CODE(4002, E_UNSUPPORTED_COLUMN_TYPE) \
7575
ERROR_CODE(4003, E_UNSUPPORTED_INDEX_TYPE) \
7676
ERROR_CODE(4004, E_OPERATION_NOT_SUPPORTED_WITH_PICKLED_DATA) \
77+
ERROR_CODE(4005, E_OPERATION_NOT_SUPPORTED_WITH_RECURSIVE_NORMALIZED_DATA) \
78+
ERROR_CODE(4006, E_OPERATION_NOT_SUPPORTED_WITH_NUMPY_ARRAY) \
7779
ERROR_CODE(5000, E_KEY_NOT_FOUND) \
7880
ERROR_CODE(5001, E_DUPLICATE_KEY) \
7981
ERROR_CODE(5002, E_SYMBOL_NOT_FOUND) \

cpp/arcticdb/version/version_core.cpp

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,52 @@ void check_multi_key_is_not_index_only(const PipelineContext& pipeline_context,
12111211
);
12121212
}
12131213

1214+
void check_can_be_filtered(const std::shared_ptr<PipelineContext>& pipeline_context, const ReadQuery& read_query) {
1215+
// To maintain backward compatibility, pending new major release to merge into below section
1216+
// Ticket: 18038782559
1217+
bool is_pickled = pipeline_context->norm_meta_ && pipeline_context->is_pickled();
1218+
util::check(
1219+
!is_pickled ||
1220+
(!read_query.columns.has_value() && std::holds_alternative<std::monostate>(read_query.row_filter)),
1221+
"The data for this symbol is pickled and does not support column stats, date_range, row_range, or column "
1222+
"queries"
1223+
);
1224+
if (pipeline_context->multi_key_) {
1225+
check_multi_key_is_not_index_only(*pipeline_context, read_query);
1226+
}
1227+
1228+
// To keep
1229+
if (pipeline_context->desc_) {
1230+
util::check(
1231+
pipeline_context->descriptor().index().type() == IndexDescriptor::Type::TIMESTAMP ||
1232+
!std::holds_alternative<IndexRange>(read_query.row_filter),
1233+
"Cannot apply date range filter to symbol with non-timestamp index"
1234+
);
1235+
auto sorted_value = pipeline_context->descriptor().sorted();
1236+
sorting::check<ErrorCode::E_UNSORTED_DATA>(
1237+
sorted_value == SortedValue::UNKNOWN || sorted_value == SortedValue::ASCENDING ||
1238+
!std::holds_alternative<IndexRange>(read_query.row_filter),
1239+
"When filtering data using date_range, the symbol must be sorted in ascending order. ArcticDB believes "
1240+
"it "
1241+
"is not sorted in ascending order and cannot therefore filter the data using date_range."
1242+
);
1243+
}
1244+
bool is_query_empty =
1245+
(!read_query.columns && !read_query.row_range &&
1246+
std::holds_alternative<std::monostate>(read_query.row_filter) && read_query.clauses_.empty());
1247+
bool is_numpy_array = pipeline_context->norm_meta_ && pipeline_context->norm_meta_->has_np();
1248+
if (!is_query_empty) {
1249+
// Exception for filtering pickled data is skipped for now for backward compatibility
1250+
if (pipeline_context->multi_key_) {
1251+
schema::raise<ErrorCode::E_OPERATION_NOT_SUPPORTED_WITH_RECURSIVE_NORMALIZED_DATA>(
1252+
"Cannot filter recursively normalized data"
1253+
);
1254+
} else if (is_numpy_array) {
1255+
schema::raise<ErrorCode::E_OPERATION_NOT_SUPPORTED_WITH_NUMPY_ARRAY>("Cannot filter numpy array");
1256+
}
1257+
}
1258+
}
1259+
12141260
static void read_indexed_keys_to_pipeline(
12151261
const std::shared_ptr<Store>& store, const std::shared_ptr<PipelineContext>& pipeline_context,
12161262
const VersionedItem& version_info, ReadQuery& read_query, const ReadOptions& read_options
@@ -1222,7 +1268,6 @@ static void read_indexed_keys_to_pipeline(
12221268
auto index_segment_reader = std::move(*maybe_reader);
12231269
ARCTICDB_DEBUG(log::version(), "Read index segment with {} keys", index_segment_reader.size());
12241270
check_can_read_index_only_if_required(index_segment_reader, read_query);
1225-
check_column_and_date_range_filterable(index_segment_reader, read_query);
12261271
add_index_columns_to_query(read_query, index_segment_reader.tsd());
12271272

12281273
const auto& tsd = index_segment_reader.tsd();
@@ -1245,6 +1290,7 @@ static void read_indexed_keys_to_pipeline(
12451290
std::move(*index_segment_reader.mutable_tsd().mutable_proto().mutable_user_meta())
12461291
);
12471292
pipeline_context->bucketize_dynamic_ = bucketize_dynamic;
1293+
check_can_be_filtered(pipeline_context, read_query);
12481294
ARCTICDB_DEBUG(
12491295
log::version(),
12501296
"read_indexed_keys_to_pipeline: Symbol {} Found {} keys with {} total rows",
@@ -2570,8 +2616,11 @@ folly::Future<ReadVersionOutput> read_frame_for_version(
25702616
) {
25712617
auto pipeline_context = setup_pipeline_context(store, version_info, *read_query, read_options);
25722618
auto res_versioned_item = generate_result_versioned_item(version_info);
2619+
25732620
if (pipeline_context->multi_key_) {
2574-
check_multi_key_is_not_index_only(*pipeline_context, *read_query);
2621+
if (read_query) {
2622+
check_can_be_filtered(pipeline_context, *read_query);
2623+
}
25752624
return read_multi_key(store, *pipeline_context->multi_key_, handler_data, std::move(res_versioned_item.key_));
25762625
}
25772626
ARCTICDB_DEBUG(log::version(), "Fetching data to frame");

python/arcticdb/util/test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,6 +1067,9 @@ def equals(x, y):
10671067
elif isinstance(x, np.ndarray):
10681068
assert isinstance(y, np.ndarray)
10691069
assert np.allclose(x, y)
1070+
elif isinstance(x, pd.DataFrame):
1071+
assert isinstance(y, pd.DataFrame)
1072+
assert_frame_equal(x, y)
10701073
else:
10711074
assert x == y
10721075

python/tests/unit/arcticdb/version_store/test_filtering.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
generic_filter_test_strings,
3232
generic_filter_test_nans,
3333
unicode_symbols,
34+
equals,
3435
)
3536
from arcticdb.util._versions import IS_PANDAS_TWO, PANDAS_VERSION, IS_NUMPY_TWO
3637

@@ -1142,6 +1143,34 @@ def test_float32_binary_comparison(lmdb_version_store_v1):
11421143
generic_filter_test(lib, symbol, q, expected)
11431144

11441145

1146+
@pytest.mark.parametrize("data", ({"a": pd.DataFrame({"col": [0]})}, np.array([1, 2, 3, 4]), np.ndarray((3, 3))))
1147+
@pytest.mark.parametrize("empty", (True, False))
1148+
def test_filter_unfilterable_data(lmdb_version_store_v1, empty, data, sym):
1149+
lib = lmdb_version_store_v1
1150+
lib.write(sym, data, recursive_normalizers=True)
1151+
1152+
q = QueryBuilder()
1153+
if empty:
1154+
equals(lib.read(sym, query_builder=q).data, data)
1155+
else:
1156+
q = q[q["col"] == 0]
1157+
with pytest.raises(SchemaException):
1158+
lib.read(sym, query_builder=q)
1159+
1160+
1161+
@pytest.mark.parametrize("data", ({"a": pd.DataFrame({"col": [0]})}, np.array([1, 2, 3, 4]), np.ndarray((3, 3))))
1162+
@pytest.mark.parametrize("head", (True, False))
1163+
def test_head_tail_unfilterable_data(lmdb_version_store_v1, head, sym, data):
1164+
lib = lmdb_version_store_v1
1165+
lib.write(sym, data, recursive_normalizers=True)
1166+
1167+
with pytest.raises(SchemaException):
1168+
if head:
1169+
lib.head(sym)
1170+
else:
1171+
lib.tail(sym)
1172+
1173+
11451174
################################
11461175
# MIXED SCHEMA TESTS FROM HERE #
11471176
################################

0 commit comments

Comments
 (0)