Skip to content

Commit 8c2e441

Browse files
authored
Disable operation on objects of mismatched types leading to corruption [Bugfix 9754433454] (#2572)
#### Reference Issues/PRs Monday 9754433454 #### What does this implement or fix? Disable append/update of mismatching types. Now only dataframes can be appended to dataframes, series can be appended to series, etc... The old behavior allowed appending a series to a dataframe but it corrupted the data. Disable append and update of Series when the column name does not match the one on disk. Old behavior added a new column but corrupted the data. #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent bf839b5 commit 8c2e441

File tree

5 files changed

+118
-1
lines changed

5 files changed

+118
-1
lines changed

cpp/arcticdb/entity/types_proto.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,5 +102,24 @@ struct formatter<arcticdb::entity::Field> {
102102
return fmt::format_to(ctx.out(), "FD<type={}>", fd.type());
103103
}
104104
};
105+
106+
template<>
107+
struct formatter<arcticdb::proto::descriptors::NormalizationMetadata::InputTypeCase> {
108+
template<typename ParseContext>
109+
constexpr auto parse(ParseContext& ctx) { return ctx.begin(); }
110+
111+
template<typename FormatContext>
112+
auto format(const arcticdb::proto::descriptors::NormalizationMetadata::InputTypeCase& type, FormatContext& ctx) const {
113+
switch (type) {
114+
case arcticdb::proto::descriptors::NormalizationMetadata::kDf: return fmt::format_to(ctx.out(), "DataFrame");
115+
case arcticdb::proto::descriptors::NormalizationMetadata::kSeries: return fmt::format_to(ctx.out(), "Series");
116+
case arcticdb::proto::descriptors::NormalizationMetadata::kTs: return fmt::format_to(ctx.out(), "TimeSeries");
117+
case arcticdb::proto::descriptors::NormalizationMetadata::kMsgPackFrame: return fmt::format_to(ctx.out(), "Pickled data");
118+
case arcticdb::proto::descriptors::NormalizationMetadata::kNp: return fmt::format_to(ctx.out(), "Array");
119+
default: return fmt::format_to(ctx.out(), "Unknown");
120+
}
121+
}
122+
};
123+
105124
} //namespace fmt
106125

cpp/arcticdb/python/normalization_checks.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ void fix_normalization_or_throw(
187187
const pipelines::InputTensorFrame &new_frame) {
188188
auto &old_norm = existing_isr.tsd().proto().normalization();
189189
auto &new_norm = new_frame.norm_meta;
190+
normalization::check<ErrorCode::E_INCOMPATIBLE_OBJECTS>(
191+
old_norm.input_type_case() == new_frame.norm_meta.input_type_case(),
192+
"{} can be performed only on objects of the same type. Existing type is {} new type is {}.",
193+
is_append ? "Append" : "Update", old_norm.input_type_case(), new_frame.norm_meta.input_type_case()
194+
);
190195
if (check_pandas_like(old_norm, new_norm)) {
191196
const IndexDescriptor::Type old_index_type = existing_isr.tsd().index().type();
192197
const IndexDescriptor::Type new_index_type = new_frame.desc.index().type();

cpp/arcticdb/version/schema_checks.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,5 +167,19 @@ void fix_descriptor_mismatch_or_throw(
167167
new_frame.desc,
168168
operation);
169169
}
170+
if (dynamic_schema && new_frame.norm_meta.has_series() && existing_isr.tsd().normalization().has_series()) {
171+
const bool both_dont_have_name = !new_frame.norm_meta.series().common().has_name() &&
172+
!existing_isr.tsd().normalization().series().common().has_name();
173+
const bool both_have_name = new_frame.norm_meta.series().common().has_name() &&
174+
existing_isr.tsd().normalization().series().common().has_name();
175+
const auto name_or_default = [](const proto::descriptors::NormalizationMetadata& meta) {
176+
return meta.series().common().has_name() ? meta.series().common().name() : "<series_name_not_set>";
177+
};
178+
schema::check<ErrorCode::E_DESCRIPTOR_MISMATCH>(
179+
both_dont_have_name || (both_have_name && new_frame.norm_meta.series().common().name() == existing_isr.tsd().normalization().series().common().name()),
180+
"Series are not allowed to have different names for append and update even for dynamic schema. Existing name: {}, new name: {}",
181+
name_or_default(existing_isr.tsd().normalization()),
182+
name_or_default(new_frame.norm_meta));
183+
}
170184
}
171185
} // namespace arcticdb

python/tests/unit/arcticdb/version_store/test_append.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
InternalException,
1313
NormalizationException,
1414
SortingException,
15+
SchemaException
1516
)
1617
from arcticdb_ext import set_config_int
1718
from arcticdb.util.test import random_integers, assert_frame_equal
@@ -673,3 +674,51 @@ def test_defragment_no_work_to_do(sym, lmdb_version_store):
673674
assert list(lmdb_version_store.list_versions(sym))[0]["version"] == 0
674675
with pytest.raises(InternalException):
675676
lmdb_version_store.defragment_symbol_data(sym)
677+
678+
@pytest.mark.parametrize("to_write, to_append", [
679+
(pd.DataFrame({"a": [1]}), pd.Series([2])),
680+
(pd.DataFrame({"a": [1]}), np.array([2])),
681+
(pd.Series([1]), pd.DataFrame({"a": [2]})),
682+
(pd.Series([1]), np.array([2])),
683+
(np.array([1]), pd.DataFrame({"a": [2]})),
684+
(np.array([1]), pd.Series([2])),
685+
(pd.DataFrame({"a": [1], "b": [2]}), pd.Series([2])),
686+
(pd.DataFrame({"a": [1], "b": [2]}), np.array([2])),
687+
(pd.Series([1]), pd.DataFrame({"a": [2], "b": [2]})),
688+
(np.array([1]), pd.DataFrame({"a": [2], "b": [2]}))
689+
])
690+
def test_append_mismatched_object_kind(to_write, to_append, lmdb_version_store_dynamic_schema_v1):
691+
lib = lmdb_version_store_dynamic_schema_v1
692+
lib.write("sym", to_write)
693+
with pytest.raises(NormalizationException) as e:
694+
lib.append("sym", to_append)
695+
assert "Append" in str(e.value)
696+
697+
@pytest.mark.parametrize("to_write, to_append", [
698+
(pd.Series([1, 2, 3], name="name_1"), pd.Series([4, 5, 6], name="name_2")),
699+
(
700+
pd.Series([1, 2, 3], name="name_1", index=pd.DatetimeIndex([pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)])),
701+
pd.Series([4, 5, 6], name="name_2", index=pd.DatetimeIndex([pd.Timestamp(3), pd.Timestamp(4), pd.Timestamp(5)]))
702+
)
703+
])
704+
def test_append_series_with_different_column_name_throws(lmdb_version_store_dynamic_schema_v1, to_write, to_append):
705+
# It makes sense to create a new column and turn the whole thing into a dataframe. This would require changes in the
706+
# logic for storing normalization metadata which is tricky. No one has requested this, so we just throw.
707+
lib = lmdb_version_store_dynamic_schema_v1
708+
lib.write("sym", to_write)
709+
with pytest.raises(SchemaException) as e:
710+
lib.append("sym", to_append)
711+
assert "name_1" in str(e.value) and "name_2" in str(e.value)
712+
713+
def test_append_series_with_different_row_range_index_name(lmdb_version_store_dynamic_schema_v1):
714+
lib = lmdb_version_store_dynamic_schema_v1
715+
to_write = pd.Series([1, 2, 3])
716+
to_write.index.name = "index_name_1"
717+
to_append = pd.Series([4, 5, 6])
718+
to_append.index.name = "index_name_2"
719+
lib.write("sym", to_write)
720+
lib.append("sym", to_append)
721+
# The current behavior is the last modification operation is setting the index name.
722+
# See Monday 9797097831, it would be best to require that index names are always matching. This is the case for
723+
# datetime index because it's a physical column. It's a potentially breaking change.
724+
assert lib.read("sym").data.index.name == "index_name_2"

python/tests/unit/arcticdb/version_store/test_update.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from arcticdb.exceptions import (
2424
InternalException,
2525
SortingException,
26+
NormalizationException,
27+
SchemaException
2628
)
2729
from arcticdb_ext.version_store import StreamDescriptorMismatch
2830
from tests.util.date import DateRange
@@ -924,4 +926,32 @@ def test_regular_update_dynamic_schema_named_index(
924926
with pytest.raises(StreamDescriptorMismatch) as exception_info:
925927
lib.update(sym, df_1, upsert=True)
926928

927-
assert "date" in str(exception_info.value)
929+
assert "date" in str(exception_info.value)
930+
931+
@pytest.mark.parametrize("to_write, to_update", [
932+
(pd.DataFrame({"a": [1]}, index=pd.DatetimeIndex([pd.Timestamp(0)])), pd.Series([2], index=pd.DatetimeIndex([pd.Timestamp(0)]))),
933+
(pd.DataFrame({"a": [1]}, index=pd.DatetimeIndex([pd.Timestamp(0)])), np.array([2])),
934+
(pd.Series([1], index=pd.DatetimeIndex([pd.Timestamp(0)])), pd.DataFrame({"a": [2]}, index=pd.DatetimeIndex([pd.Timestamp(0)]))),
935+
(pd.Series([1], index=pd.DatetimeIndex([pd.Timestamp(0)])), np.array([2])),
936+
(np.array([1]), pd.DataFrame({"a": [2]}, index=pd.DatetimeIndex([pd.Timestamp(0)]))),
937+
(np.array([1]), pd.Series([2], index=pd.DatetimeIndex([pd.Timestamp(0)])))
938+
])
939+
def test_update_mismatched_object_kind(to_write, to_update, lmdb_version_store_dynamic_schema_v1):
940+
lib = lmdb_version_store_dynamic_schema_v1
941+
lib.write("sym", to_write)
942+
if isinstance(to_update, np.ndarray) or isinstance(to_write, np.ndarray):
943+
with pytest.raises(Exception) as e:
944+
assert "Index mismatch" in str(e.value)
945+
else:
946+
with pytest.raises(NormalizationException) as e:
947+
lib.update("sym", to_update)
948+
assert "Update" in str(e.value)
949+
950+
def test_update_series_with_different_column_name_throws(lmdb_version_store_dynamic_schema_v1):
951+
# It makes sense to create a new column and turn the whole thing into a dataframe. This would require changes in the
952+
# logic for storing normalization metadata which is tricky. No one has requested this, so we just throw.
953+
lib = lmdb_version_store_dynamic_schema_v1
954+
lib.write("sym", pd.Series([1, 2, 3], name="name_1", index=pd.DatetimeIndex([pd.Timestamp(0), pd.Timestamp(1), pd.Timestamp(2)])))
955+
with pytest.raises(SchemaException) as e:
956+
lib.update("sym", pd.Series([1], name="name_2", index=pd.DatetimeIndex([pd.Timestamp(0)])))
957+
assert "name_1" in str(e.value) and "name_2" in str(e.value)

0 commit comments

Comments
 (0)