Skip to content

Commit 6c2363b

Browse files
authored
chore: [sc-63841] [core] adding multiple attributes with schema evolution does not add them in a predictable order (#5470)
Story details: https://app.shortcut.com/tiledb-inc/story/63841 Previously attributes were stored in the `SchemaEvolution` via `std::unordered_map` which resulted in an arbitrary order that the attributes would be appended to the target schema. This pull request uses `std::vector` instead to preserve the order of added attributes. Lookups by attribute name now require a linear scan of the added attributes. This will probably be faster than the `unordered_map` lookup was for small numbers of attributes; when there are many attributes it may not be, but it should be a rare use case where that would be noticeable. I also noticed that it is possible to evolve into an invalid schema which cannot be loaded. This pull request adds an integrity check before returning a successful schema evolution to fix this. --- TYPE: IMPROVEMENT DESC: Schema evolution consistent attribute order
1 parent 74e775a commit 6c2363b

File tree

8 files changed

+330
-40
lines changed

8 files changed

+330
-40
lines changed

test/src/unit-cppapi-schema-evolution.cc

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232

3333
#include <test/support/src/vfs_helpers.h>
3434
#include <test/support/tdb_catch.h>
35+
#include "test/support/src/array_helpers.h"
36+
#include "test/support/src/array_schema_helpers.h"
3537
#include "test/support/src/mem_helpers.h"
38+
3639
#include "tiledb/sm/array_schema/array_schema.h"
3740
#include "tiledb/sm/array_schema/array_schema_evolution.h"
3841
#include "tiledb/sm/array_schema/attribute.h"
@@ -48,6 +51,23 @@
4851

4952
using namespace tiledb;
5053

54+
/**
55+
* @return a simple schema with dimension d1 and attribute a1
56+
*/
57+
static ArraySchema simple_schema(Context& ctx) {
58+
Domain domain(ctx);
59+
auto d1 = Dimension::create<int>(ctx, "d1", {{-100, 100}}, 10);
60+
domain.add_dimension(d1);
61+
62+
auto a1 = Attribute::create<int>(ctx, "a1");
63+
64+
ArraySchema schema(ctx, TILEDB_DENSE);
65+
schema.set_domain(domain);
66+
schema.add_attribute(a1);
67+
68+
return schema;
69+
}
70+
5171
TEST_CASE(
5272
"C++ API: SchemaEvolution, add and drop attributes",
5373
"[cppapi][schema][evolution][add][drop][rest]") {
@@ -950,3 +970,182 @@ TEST_CASE(
950970

951971
CHECK_NOTHROW(ase->evolve_schema(schema));
952972
}
973+
974+
TEST_CASE(
975+
"C++ API: SchemaEvolution add multiple attributes",
976+
"[cppapi][schema][evolution][add]") {
977+
test::VFSTestSetup vfs_test_setup;
978+
Context ctx{vfs_test_setup.ctx()};
979+
auto array_uri{vfs_test_setup.array_uri(
980+
"test_schema_evolution_add_multiple_attributes")};
981+
982+
// create initial array
983+
Domain domain(ctx);
984+
auto d1 = Dimension::create<int>(ctx, "d1", {{-100, 100}}, 10);
985+
domain.add_dimension(d1);
986+
987+
auto a1 = Attribute::create<int>(ctx, "a1");
988+
989+
ArraySchema schema(ctx, TILEDB_DENSE);
990+
schema.set_domain(domain);
991+
schema.add_attribute(a1);
992+
993+
auto add_attributes = std::vector<Attribute>{
994+
Attribute::create<int>(ctx, "a2"),
995+
Attribute::create<int>(ctx, "a3"),
996+
Attribute::create<int>(ctx, "a4")};
997+
998+
auto permutation = GENERATE(
999+
std::vector<int>{0, 1, 2},
1000+
std::vector<int>{0, 2, 1},
1001+
std::vector<int>{1, 0, 2},
1002+
std::vector<int>{1, 2, 0},
1003+
std::vector<int>{2, 0, 1},
1004+
std::vector<int>{2, 1, 0});
1005+
1006+
DYNAMIC_SECTION(
1007+
"Add a1/a2/a3 in permutation: " + std::to_string(permutation[0]) + "/" +
1008+
std::to_string(permutation[1]) + std::to_string(permutation[2])) {
1009+
// create array
1010+
Array::create(array_uri, schema);
1011+
test::DeleteArrayGuard guard(ctx.ptr().get(), array_uri.c_str());
1012+
1013+
// evolve it
1014+
auto evolution = ArraySchemaEvolution(ctx);
1015+
for (auto idx : permutation) {
1016+
evolution.add_attribute(add_attributes[idx]);
1017+
}
1018+
evolution.array_evolve(array_uri);
1019+
1020+
// check attribute order
1021+
auto schema = Array::load_schema(ctx, array_uri);
1022+
std::vector<Attribute> attributes;
1023+
for (unsigned a = 0; a < schema.attribute_num(); a++) {
1024+
attributes.push_back(schema.attribute(a));
1025+
}
1026+
1027+
CHECK(attributes.size() == 4);
1028+
if (attributes.size() >= 1) {
1029+
CHECK(test::is_equivalent_attribute(attributes[0], a1));
1030+
}
1031+
if (attributes.size() >= 2) {
1032+
CHECK(test::is_equivalent_attribute(
1033+
attributes[1], add_attributes[permutation[0]]));
1034+
}
1035+
if (attributes.size() >= 3) {
1036+
CHECK(test::is_equivalent_attribute(
1037+
attributes[2], add_attributes[permutation[1]]));
1038+
}
1039+
if (attributes.size() >= 4) {
1040+
CHECK(test::is_equivalent_attribute(
1041+
attributes[3], add_attributes[permutation[2]]));
1042+
}
1043+
}
1044+
}
1045+
1046+
TEST_CASE(
1047+
"C++ API: SchemaEvolution add duplicate attribute",
1048+
"[cppapi][schema][evolution][add]") {
1049+
test::VFSTestSetup vfs_test_setup;
1050+
Context ctx{vfs_test_setup.ctx()};
1051+
auto array_uri{vfs_test_setup.array_uri(
1052+
"test_schema_evolution_add_duplicate_attribute")};
1053+
1054+
// create initial array
1055+
Domain domain(ctx);
1056+
auto d1 = Dimension::create<int>(ctx, "d1", {{-100, 100}}, 10);
1057+
domain.add_dimension(d1);
1058+
1059+
auto a1 = Attribute::create<int>(ctx, "a1");
1060+
1061+
ArraySchema schema(ctx, TILEDB_DENSE);
1062+
schema.set_domain(domain);
1063+
schema.add_attribute(a1);
1064+
1065+
SECTION("Add attribute to evolution twice") {
1066+
// try evolving
1067+
auto evolution = ArraySchemaEvolution(ctx);
1068+
evolution.add_attribute(Attribute::create<int>(ctx, "a2"));
1069+
CHECK_THROWS(evolution.add_attribute(Attribute::create<int>(ctx, "a2")));
1070+
1071+
evolution.drop_attribute("a2");
1072+
evolution.add_attribute(Attribute::create<int>(ctx, "a2"));
1073+
CHECK_THROWS(evolution.add_attribute(Attribute::create<int>(ctx, "a2")));
1074+
}
1075+
1076+
SECTION("Add attribute with same name as schema attribute") {
1077+
// create array
1078+
Array::create(array_uri, schema);
1079+
test::DeleteArrayGuard guard(ctx.ptr().get(), array_uri.c_str());
1080+
1081+
// try evolving
1082+
auto evolution = ArraySchemaEvolution(ctx);
1083+
evolution.add_attribute(Attribute::create<int>(ctx, "a1"));
1084+
1085+
// should throw, cannot add an attribute with the same name
1086+
CHECK_THROWS(evolution.array_evolve(array_uri));
1087+
1088+
// load schema back should succeed
1089+
CHECK_NOTHROW(Array::load_schema(ctx, array_uri));
1090+
}
1091+
}
1092+
1093+
TEST_CASE(
1094+
"C++ API: SchemaEvolution drop last attribute",
1095+
"[cppapi][schema][evolution][drop]") {
1096+
test::VFSTestSetup vfs_test_setup;
1097+
Context ctx{vfs_test_setup.ctx()};
1098+
auto array_uri{
1099+
vfs_test_setup.array_uri("test_schema_evolution_drop_last_attribute")};
1100+
1101+
// create array
1102+
ArraySchema schema = simple_schema(ctx);
1103+
Array::create(array_uri, schema);
1104+
test::DeleteArrayGuard guard(ctx.ptr().get(), array_uri.c_str());
1105+
1106+
// try evolving
1107+
auto evolution = ArraySchemaEvolution(ctx);
1108+
evolution.drop_attribute("a1");
1109+
1110+
// should throw, schema must have at least one attribute
1111+
CHECK_THROWS(evolution.array_evolve(array_uri));
1112+
1113+
// load schema back should succeed
1114+
CHECK_NOTHROW(Array::load_schema(ctx, array_uri));
1115+
}
1116+
1117+
/**
1118+
* Add an enumeration which is not used by any attribute.
1119+
*
1120+
* FIXME: should this be an error?
1121+
* As far as I can tell the enumeration is undiscoverable if no attribute
1122+
* references it. It can be looked up by name but there is no
1123+
* API to list enumeration names.
1124+
*/
1125+
TEST_CASE("C++ API: SchemaEvolution add unused enumeration") {
1126+
test::VFSTestSetup vfs_test_setup;
1127+
Context ctx{vfs_test_setup.ctx()};
1128+
auto array_uri{
1129+
vfs_test_setup.array_uri("test_schema_evolution_add_unused_enumeration")};
1130+
1131+
// create array
1132+
ArraySchema schema = simple_schema(ctx);
1133+
Array::create(array_uri, schema);
1134+
test::DeleteArrayGuard guard(ctx.ptr().get(), array_uri.c_str());
1135+
1136+
// evolve
1137+
auto evolution = ArraySchemaEvolution(ctx);
1138+
Enumeration enumeration_in = Enumeration::create_empty(
1139+
ctx, "us_states", TILEDB_STRING_ASCII, tiledb::sm::constants::var_num);
1140+
evolution.add_enumeration(enumeration_in);
1141+
1142+
evolution.array_evolve(array_uri);
1143+
1144+
auto schema_out = Array::load_schema(ctx, array_uri);
1145+
1146+
Enumeration enumeration_out =
1147+
ArraySchemaExperimental::get_enumeration_from_name(
1148+
ctx, schema_out, "us_states");
1149+
1150+
CHECK(test::is_equivalent_enumeration(enumeration_in, enumeration_out));
1151+
}

test/src/unit-enumerations.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,7 +1639,7 @@ TEST_CASE_METHOD(
16391639

16401640
auto orig_schema = array->array_schema_latest_ptr();
16411641
auto ase = make_shared<ArraySchemaEvolution>(HERE(), memory_tracker_);
1642-
auto attr3 = make_shared<Attribute>(HERE(), "attr3", Datatype::UINT32);
1642+
auto attr3 = make_shared<Attribute>(HERE(), "attr4", Datatype::UINT32);
16431643
ase->add_attribute(attr3);
16441644
CHECK_NOTHROW(ase->evolve_schema(orig_schema));
16451645
}
@@ -1656,7 +1656,7 @@ TEST_CASE_METHOD(
16561656
auto enmr = create_enumeration(values);
16571657
ase->add_enumeration(enmr);
16581658

1659-
auto attr3 = make_shared<Attribute>(HERE(), "attr3", Datatype::UINT32);
1659+
auto attr3 = make_shared<Attribute>(HERE(), "attr4", Datatype::UINT32);
16601660
attr3->set_enumeration_name(default_enmr_name);
16611661
ase->add_attribute(attr3);
16621662

test/support/src/array_schema_helpers.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,55 @@
3131
*/
3232

3333
#include "test/support/src/array_schema_helpers.h"
34+
#include "tiledb/api/c_api/enumeration/enumeration_api_internal.h"
3435

3536
using namespace tiledb;
3637

3738
namespace tiledb::test {
3839

40+
bool is_equivalent_filter(
41+
const tiledb::Filter& left, const tiledb::Filter& right) {
42+
// FIXME if it ever matters: check options
43+
return left.filter_type() == right.filter_type();
44+
}
45+
46+
bool is_equivalent_filter_list(
47+
const tiledb::FilterList& left, const tiledb::FilterList& right) {
48+
if (left.nfilters() != right.nfilters()) {
49+
return false;
50+
}
51+
for (unsigned i = 0; i < left.nfilters(); i++) {
52+
if (!(is_equivalent_filter(left.filter(i), right.filter(i)))) {
53+
return false;
54+
}
55+
}
56+
return true;
57+
}
58+
59+
bool is_equivalent_attribute(
60+
const tiledb::Attribute& left, const tiledb::Attribute& right) {
61+
if (!(left.name() == right.name() && left.type() == right.type() &&
62+
left.cell_val_num() == right.cell_val_num() &&
63+
left.nullable() == right.nullable() &&
64+
is_equivalent_filter_list(left.filter_list(), right.filter_list()))) {
65+
return false;
66+
}
67+
68+
const void *leftfill, *rightfill;
69+
uint64_t leftfillsize, rightfillsize;
70+
uint8_t leftfillvalid, rightfillvalid;
71+
if (left.nullable()) {
72+
left.get_fill_value(&leftfill, &leftfillsize, &leftfillvalid);
73+
right.get_fill_value(&rightfill, &rightfillsize, &rightfillvalid);
74+
} else {
75+
left.get_fill_value(&leftfill, &leftfillsize);
76+
right.get_fill_value(&rightfill, &rightfillsize);
77+
leftfillvalid = rightfillvalid = 1;
78+
}
79+
return leftfillsize == rightfillsize && leftfillvalid == rightfillvalid &&
80+
(memcmp(leftfill, rightfill, leftfillsize) == 0);
81+
}
82+
3983
bool is_equivalent_enumeration(
4084
const sm::Enumeration& left, const sm::Enumeration& right) {
4185
return left.name() == right.name() && left.type() == right.type() &&
@@ -48,4 +92,10 @@ bool is_equivalent_enumeration(
4892
right.data().end());
4993
}
5094

95+
bool is_equivalent_enumeration(
96+
const Enumeration& left, const Enumeration& right) {
97+
return is_equivalent_enumeration(
98+
*left.ptr()->enumeration().get(), *right.ptr()->enumeration().get());
99+
}
100+
51101
} // namespace tiledb::test

test/support/src/array_schema_helpers.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,49 @@
3333
#ifndef TILEDB_TEST_ARRAY_SCHEMA_HELPERS_H
3434
#define TILEDB_TEST_ARRAY_SCHEMA_HELPERS_H
3535

36+
#include "tiledb/sm/array_schema/array_schema.h"
37+
#include "tiledb/sm/array_schema/attribute.h"
3638
#include "tiledb/sm/array_schema/enumeration.h"
39+
#include "tiledb/sm/cpp_api/tiledb"
40+
41+
#include "tiledb/sm/cpp_api/tiledb"
42+
#include "tiledb/sm/cpp_api/tiledb_experimental"
3743

3844
namespace tiledb::test {
3945

46+
/**
47+
* @return if two filters `left` and `right` represent the same transformations
48+
*/
49+
bool is_equivalent_filter(
50+
const tiledb::Filter& left, const tiledb::Filter& right);
51+
52+
/**
53+
* @return if two filter lists `left` and `right` have the same filters in the
54+
* same order
55+
*/
56+
bool is_equivalent_filter_list(
57+
const tiledb::FilterList& left, const tiledb::FilterList& right);
58+
59+
/**
60+
* @return if two attributes `left` and `right` are equivalent
61+
*/
62+
bool is_equivalent_attribute(
63+
const tiledb::Attribute& left, const tiledb::Attribute& right);
64+
4065
/**
4166
* @return if two enumerations `left` and `right` are equivalent,
4267
* i.e. have the same name, datatype, variants, etc
4368
*/
4469
bool is_equivalent_enumeration(
4570
const sm::Enumeration& left, const sm::Enumeration& right);
4671

72+
/**
73+
* @return if two enumerations `left` and `right` are equivalent,
74+
* i.e. have the same name, datatype, variants, etc
75+
*/
76+
bool is_equivalent_enumeration(
77+
const Enumeration& left, const Enumeration& right);
78+
4779
} // namespace tiledb::test
4880

4981
#endif

0 commit comments

Comments
 (0)