Skip to content

Commit 8c2e441

Browse files
authored
Disable operation on objects of mismatched types leading to corruption [Bugfix 9754433454] (#2572)
#### Reference Issues/PRs Monday 9754433454 #### What does this implement or fix? Disable append/update of mismatching types. Now only dataframes can be appended to dataframes, series can be appended to series, etc... The old behavior allowed appending a series to a dataframe but it corrupted the data. Disable append and update of Series when the column name does not match the one on disk. Old behavior added a new column but corrupted the data. #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent bf839b5 commit 8c2e441

File tree

5 files changed

+118
-1
lines changed

5 files changed

+118
-1
lines changed

cpp/arcticdb/entity/types_proto.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,5 +102,24 @@ struct formatter<arcticdb::entity::Field> {
102102
return fmt::format_to(ctx.out(), "FD<type={}>", fd.type());
103103
}
104104
};
105+
106+
template<>
107+
struct formatter<arcticdb::proto::descriptors::NormalizationMetadata::InputTypeCase> {
108+
template<typename ParseContext>
109+
constexpr auto parse(ParseContext& ctx) { return ctx.begin(); }
110+
111+
template<typename FormatContext>
112+
auto format(const arcticdb::proto::descriptors::NormalizationMetadata::InputTypeCase& type, FormatContext& ctx) const {
113+
switch (type) {
114+
case arcticdb::proto::descriptors::NormalizationMetadata::kDf: return fmt::format_to(ctx.out(), "DataFrame");
115+
case arcticdb::proto::descriptors::NormalizationMetadata::kSeries: return fmt::format_to(ctx.out(), "Series");
116+
case arcticdb::proto::descriptors::NormalizationMetadata::kTs: return fmt::format_to(ctx.out(), "TimeSeries");
117+
case arcticdb::proto::descriptors::NormalizationMetadata::kMsgPackFrame: return fmt::format_to(ctx.out(), "Pickled data");
118+
case arcticdb::proto::descriptors::NormalizationMetadata::kNp: return fmt::format_to(ctx.out(), "Array");
119+
default: return fmt::format_to(ctx.out(), "Unknown");
120+
}
121+
}
122+
};
123+
105124
} //namespace fmt
106125

cpp/arcticdb/python/normalization_checks.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ void fix_normalization_or_throw(
187187
const pipelines::InputTensorFrame &new_frame) {
188188
auto &old_norm = existing_isr.tsd().proto().normalization();
189189
auto &new_norm = new_frame.norm_meta;
190+
normalization::check<ErrorCode::E_INCOMPATIBLE_OBJECTS>(
191+
old_norm.input_type_case() == new_frame.norm_meta.input_type_case(),
192+
"{} can be performed only on objects of the same type. Existing type is {} new type is {}.",
193+
is_append ? "Append" : "Update", old_norm.input_type_case(), new_frame.norm_meta.input_type_case()
194+
);
190195
if (check_pandas_like(old_norm, new_norm)) {
191196
const IndexDescriptor::Type old_index_type = existing_isr.tsd().index().type();
192197
const IndexDescriptor::Type new_index_type = new_frame.desc.index().type();

cpp/arcticdb/version/schema_checks.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,5 +167,19 @@ void fix_descriptor_mismatch_or_throw(
167167
new_frame.desc,
168168
operation);
169169
}
170+
if (dynamic_schema && new_frame.norm_meta.has_series() && existing_isr.tsd().normalization().has_series()) {
171+
const bool both_dont_have_name = !new_frame.norm_meta.series().common().has_name() &&
172+
!existing_isr.tsd().normalization().series().common().has_name();
173+
const bool both_have_name = new_frame.norm_meta.series().common().has_name() &&
174+
existing_isr.tsd().normalization().series().common().has_name();
175+
const auto name_or_default = [](const proto::descriptors::NormalizationMetadata& meta) {
176+
return meta.series().common().has_name() ? meta.series().common().name() : "<series_name_not_set>";
177+
};
178+
schema::check<ErrorCode::E_DESCRIPTOR_MISMATCH>(
179+
both_dont_have_name || (both_have_name && new_frame.norm_meta.series().common().name() == existing_isr.tsd().normalization().series().common().name()),
180+
"Series are not allowed to have different names for append and update even for dynamic schema. Existing name: {}, new name: {}",
181+
name_or_default(existing_isr.tsd().normalization()),
182+
name_or_default(new_frame.norm_meta));
183+
}
170184
}
171185
} // namespace arcticdb

python/tests/unit/arcticdb/version_store/test_append.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
InternalException,
1313
NormalizationException,
1414
SortingException,
15+
SchemaException
1516
)
1617
from arcticdb_ext import set_config_int
1718
from arcticdb.util.test import random_integers, assert_frame_equal
@@ -673,3 +674,51 @@ def test_defragment_no_work_to_do(sym, lmdb_version_store):
673674
assert list(lmdb_version_store.list_versions(sym))[0]["version"] == 0
674675
with pytest.raises(InternalException):
675676
lmdb_version_store.defragment_symbol_data(sym)
677+
678+
@pytest.mark.parametrize("to_write, to_append", [
679+
(pd.DataFrame({"a": [1]}), pd.Series([2])),
680+
(pd.DataFrame({"a": [1]}), np.array([2])),
681+
(pd.Series([1]), pd.DataFrame({"a": [2]})),
682+
(pd.Series([1]), np.array([2])),
683+
(np.array([1]), pd.DataFrame({"a": [2]})),
684+
(np.array([1]), pd.Series([2])),
685+
(pd.DataFrame({"a": [1], "b": [2]}), pd.Series([2])),
686+
(pd.DataFrame({"a": [1], "b": [2]}), np.array([2])),
687+
(pd.Series([1]), pd.DataFrame({"a": [2], "b": [2]})),
688+
(np.array([1]), pd.DataFrame({"a": [2], "b": [2]}))
689+
])
690+
def test_append_mismatched_object_kind(to_write, to_append, lmdb_version_store_dynamic_schema_v1):
691+
lib = lmdb_version_store_dynamic_schema_v1
692+
lib.write("sym", to_write)
693+
with pytest.raises(NormalizationException) as e:
694+
lib.append("sym", to_append)
695+
assert "Append" in str(e.value)
696+
697+
@pytest.mark.parametrize("to_write, to_append", [
698+
(pd.Series([1, 2, 3], name="name_1"), pd.Series([4, 5, 6], name="name_2")),
699+
(
700+
pd.Series([1, 2, 3], name="name_1", index=pd.DatetimeIndex([pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)])),
701+
pd.Series([4, 5, 6], name="name_2", index=pd.DatetimeIndex([pd.Timestamp(3), pd.Timestamp(4), pd.Timestamp(5)]))
702+
)
703+
])
704+
def test_append_series_with_different_column_name_throws(lmdb_version_store_dynamic_schema_v1, to_write, to_append):
705+
# It makes sense to create a new column and turn the whole thing into a dataframe. This would require changes in the
706+
# logic for storing normalization metadata which is tricky. No one has requested this, so we just throw.
707+
lib = lmdb_version_store_dynamic_schema_v1
708+
lib.write("sym", to_write)
709+
with pytest.raises(SchemaException) as e:
710+
lib.append("sym", to_append)
711+
assert "name_1" in str(e.value) and "name_2" in str(e.value)
712+
713+
def test_append_series_with_different_row_range_index_name(lmdb_version_store_dynamic_schema_v1):
714+
lib = lmdb_version_store_dynamic_schema_v1
715+
to_write = pd.Series([1, 2, 3])
716+
to_write.index.name = "index_name_1"
717+
to_append = pd.Series([4, 5, 6])
718+
to_append.index.name = "index_name_2"
719+
lib.write("sym", to_write)
720+
lib.append("sym", to_append)
721+
# The current behavior is the last modification operation is setting the index name.
722+
# See Monday 9797097831, it would be best to require that index names are always matching. This is the case for
723+
# datetime index because it's a physical column. It's a potentially breaking change.
724+
assert lib.read("sym").data.index.name == "index_name_2"

python/tests/unit/arcticdb/version_store/test_update.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from arcticdb.exceptions import (
2424
InternalException,
2525
SortingException,
26+
NormalizationException,
27+
SchemaException
2628
)
2729
from arcticdb_ext.version_store import StreamDescriptorMismatch
2830
from tests.util.date import DateRange
@@ -924,4 +926,32 @@ def test_regular_update_dynamic_schema_named_index(
924926
with pytest.raises(StreamDescriptorMismatch) as exception_info:
925927
lib.update(sym, df_1, upsert=True)
926928

927-
assert "date" in str(exception_info.value)
929+
assert "date" in str(exception_info.value)
930+
931+
@pytest.mark.parametrize("to_write, to_update", [
932+
(pd.DataFrame({"a": [1]}, index=pd.DatetimeIndex([pd.Timestamp(0)])), pd.Series([2], index=pd.DatetimeIndex([pd.Timestamp(0)]))),
933+
(pd.DataFrame({"a": [1]}, index=pd.DatetimeIndex([pd.Timestamp(0)])), np.array([2])),
934+
(pd.Series([1], index=pd.DatetimeIndex([pd.Timestamp(0)])), pd.DataFrame({"a": [2]}, index=pd.DatetimeIndex([pd.Timestamp(0)]))),
935+
(pd.Series([1], index=pd.DatetimeIndex([pd.Timestamp(0)])), np.array([2])),
936+
(np.array([1]), pd.DataFrame({"a": [2]}, index=pd.DatetimeIndex([pd.Timestamp(0)]))),
937+
(np.array([1]), pd.Series([2], index=pd.DatetimeIndex([pd.Timestamp(0)])))
938+
])
939+
def test_update_mismatched_object_kind(to_write, to_update, lmdb_version_store_dynamic_schema_v1):
940+
lib = lmdb_version_store_dynamic_schema_v1
941+
lib.write("sym", to_write)
942+
if isinstance(to_update, np.ndarray) or isinstance(to_write, np.ndarray):
943+
with pytest.raises(Exception) as e:
944+
assert "Index mismatch" in str(e.value)
945+
else:
946+
with pytest.raises(NormalizationException) as e:
947+
lib.update("sym", to_update)
948+
assert "Update" in str(e.value)
949+
950+
def test_update_series_with_different_column_name_throws(lmdb_version_store_dynamic_schema_v1):
951+
# It makes sense to create a new column and turn the whole thing into a dataframe. This would require changes in the
952+
# logic for storing normalization metadata which is tricky. No one has requested this, so we just throw.
953+
lib = lmdb_version_store_dynamic_schema_v1
954+
lib.write("sym", pd.Series([1, 2, 3], name="name_1", index=pd.DatetimeIndex([pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)])))
955+
with pytest.raises(SchemaException) as e:
956+
lib.update("sym", pd.Series([1], name="name_2", index=pd.DatetimeIndex([pd.Timestamp(0)])))
957+
assert "name_1" in str(e.value) and "name_2" in str(e.value)

0 commit comments

Comments
 (0)