Skip to content

Commit 18fbe06

Browse files
authored
Merge pull request #29590 from nvartolomei/nv/iceberg-json-schema-oneof
iceberg/json_schema: support oneOf for nullable types
2 parents 46e0fb7 + afb0000 commit 18fbe06

File tree

5 files changed

+711
-8
lines changed

5 files changed

+711
-8
lines changed

src/v/iceberg/conversion/ir_json.cc

Lines changed: 110 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,13 @@ class type_set {
4949
static type_set none() { return type_set{}; }
5050

5151
void set(json_type t) { bits_.set(index(t)); }
52+
void intersect(const type_set& other) { bits_ &= other.bits_; }
5253
bool test(json_type t) const { return bits_.test(index(t)); }
5354

55+
bool is_null_only() const {
56+
return bits_.count() == 1 && bits_.test(index(json_type::null));
57+
}
58+
5459
/// Returns the single non-null type if exactly one is set.
5560
std::optional<json_type> single_non_null_type() const {
5661
auto copy = bits_;
@@ -113,12 +118,56 @@ struct constraint {
113118

114119
// Object properties, keyed by name.
115120
std::map<std::string, constraint> properties = {};
121+
bool additional_properties_allowed = true;
116122

117123
// Array item constraints.
118124
// - nullopt: no "items" keyword
119125
// - empty vector: "items": []
120126
// - non-empty: item schema(s) to validate
121127
std::optional<std::vector<constraint>> items = std::nullopt;
128+
129+
conversion_outcome<void> intersect(const constraint& other) {
130+
types.intersect(other.types);
131+
132+
if (format.has_value() && other.format.has_value()) {
133+
if (format != other.format) {
134+
return conversion_exception(
135+
"Conflicting format annotations across branches");
136+
}
137+
} else if (other.format.has_value()) {
138+
format = other.format;
139+
}
140+
141+
if (!properties.empty() && !other.properties.empty()) {
142+
return conversion_exception(
143+
"Intersecting constraints with properties on both sides "
144+
"is not supported");
145+
} else if (!other.properties.empty()) {
146+
if (!additional_properties_allowed) {
147+
return conversion_exception(
148+
"additionalProperties: false conflicts with properties "
149+
"defined in another branch");
150+
}
151+
properties = other.properties;
152+
} else if (
153+
!properties.empty() && !other.additional_properties_allowed) {
154+
return conversion_exception(
155+
"additionalProperties: false conflicts with properties "
156+
"defined in another branch");
157+
}
158+
additional_properties_allowed = additional_properties_allowed
159+
&& other.additional_properties_allowed;
160+
161+
if (items.has_value() && other.items.has_value()) {
162+
return conversion_exception(
163+
"Intersecting constraints with items on both sides is not "
164+
"supported");
165+
} else if (other.items.has_value()) {
166+
items = other.items;
167+
}
168+
169+
return outcome::success();
170+
}
122171
};
123172

124173
/// Context for the resolution phase.
@@ -149,6 +198,48 @@ collect(collect_context& ctx, const conversion::json_schema::subschema&);
149198

150199
conversion_outcome<field_type> resolve(resolution_context&, const constraint&);
151200

201+
// This is not full fidelity oneOf support - only T|null is supported.
202+
// In the general case, oneOf is a XOR over multiple schemas. For T|null
203+
// it is reduced to OR.
204+
conversion_outcome<constraint> collect_one_of_t_xor_null(
205+
collect_context& ctx,
206+
const conversion::json_schema::const_list_view& one_of) {
207+
constexpr std::string_view unsupported_one_of_msg
208+
= "oneOf keyword is supported only for exclusive T|null structures";
209+
if (one_of.size() != 2) {
210+
return conversion_exception(std::string{unsupported_one_of_msg});
211+
}
212+
213+
auto c1 = collect(ctx, one_of.at(0));
214+
if (c1.has_error()) {
215+
return c1.error();
216+
}
217+
auto c2 = collect(ctx, one_of.at(1));
218+
if (c2.has_error()) {
219+
return c2.error();
220+
}
221+
222+
// Accept only exclusive oneOf(T, null):
223+
// - exactly one branch is null-only
224+
// - the non-null branch must not include null
225+
const bool c1_is_null_only = c1.value().types.is_null_only();
226+
const bool c2_is_null_only = c2.value().types.is_null_only();
227+
228+
if (c1_is_null_only == c2_is_null_only) {
229+
return conversion_exception(std::string{unsupported_one_of_msg});
230+
}
231+
232+
const constraint& non_null_branch = c1_is_null_only ? c2.value()
233+
: c1.value();
234+
if (non_null_branch.types.test(json_type::null)) {
235+
return conversion_exception(std::string{unsupported_one_of_msg});
236+
}
237+
238+
auto c = non_null_branch;
239+
c.types.set(json_type::null);
240+
return c;
241+
}
242+
152243
/// Collect item constraints from a JSON Schema array.
153244
conversion_outcome<std::optional<std::vector<constraint>>> collect_items(
154245
collect_context& ctx, const conversion::json_schema::subschema& s) {
@@ -232,12 +323,14 @@ collect(collect_context& ctx, const conversion::json_schema::subschema& s) {
232323
c.properties[name] = std::move(prop_constraint.value());
233324
}
234325

235-
if (
236-
s.additional_properties()
237-
&& s.additional_properties()->get().boolean_subschema() != false) {
238-
return conversion_exception(
239-
"Only 'false' subschema is supported "
240-
"for additionalProperties keyword");
326+
if (s.additional_properties()) {
327+
if (s.additional_properties()->get().boolean_subschema() == false) {
328+
c.additional_properties_allowed = false;
329+
} else {
330+
return conversion_exception(
331+
"Only 'false' subschema is supported "
332+
"for additionalProperties keyword");
333+
}
241334
}
242335
}
243336

@@ -250,6 +343,17 @@ collect(collect_context& ctx, const conversion::json_schema::subschema& s) {
250343
c.items = std::move(items_result.value());
251344
}
252345

346+
if (!s.one_of().empty()) {
347+
auto one_of_constraint = collect_one_of_t_xor_null(ctx, s.one_of());
348+
if (one_of_constraint.has_error()) {
349+
return one_of_constraint.error();
350+
}
351+
auto intersect_result = c.intersect(one_of_constraint.value());
352+
if (intersect_result.has_error()) {
353+
return intersect_result.error();
354+
}
355+
}
356+
253357
return c;
254358
}
255359

src/v/iceberg/conversion/json_schema/frontend.cc

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,6 @@ constexpr auto banned_keywords = std::to_array({
300300
"else",
301301
"allOf",
302302
"anyOf",
303-
"oneOf",
304303
});
305304

306305
}; // namespace
@@ -525,6 +524,12 @@ class frontend::frontend_impl {
525524
}
526525
}
527526

527+
if (
528+
const auto one_of_node = find_keyword(
529+
node, "oneOf", {json_value_type::array})) {
530+
compile_one_of(ctx, *sub, *one_of_node);
531+
}
532+
528533
if (
529534
const auto format_node = find_keyword(
530535
node, "format", {json_value_type::string})) {
@@ -657,6 +662,33 @@ class frontend::frontend_impl {
657662
"The items keyword must be an object or an array of objects");
658663
}
659664
}
665+
666+
void compile_one_of(
667+
compile_context& ctx, subschema& sub, const json::Value& node) const {
668+
vassert(
669+
sub.one_of_.empty(),
670+
"oneOf subschema vector should be empty at this point");
671+
672+
if (!node.IsArray() || node.GetArray().Empty()) {
673+
throw std::runtime_error(
674+
"The oneOf keyword must be a non-empty array");
675+
}
676+
677+
std::vector<ss::shared_ptr<subschema>> one_of_subschemas;
678+
one_of_subschemas.reserve(node.GetArray().Size());
679+
for (const auto& item : node.GetArray()) {
680+
one_of_subschemas.push_back(compile_subschema(ctx, item));
681+
}
682+
683+
for (size_t i = 0; i < one_of_subschemas.size(); ++i) {
684+
auto& one_of_subschema = one_of_subschemas[i];
685+
auto [it, inserted] = sub.subschemas_.emplace(
686+
fmt::format("oneOf/{}", i), one_of_subschema);
687+
vassert(inserted, "unique insertion should have succeeded");
688+
}
689+
690+
sub.one_of_ = std::move(one_of_subschemas);
691+
}
660692
};
661693

662694
frontend::frontend()

src/v/iceberg/conversion/json_schema/ir.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ class subschema {
303303
: std::nullopt;
304304
}
305305

306+
/// \return Subschemas in the "oneOf" keyword if specified.
307+
const_list_view one_of() const { return const_list_view(one_of_); }
308+
306309
/// \return Format of this schema object if specified.
307310
/// See https://www.learnjsonschema.com/2020-12/validation/format
308311
const std::optional<format>& format() const { return format_; }
@@ -333,6 +336,8 @@ class subschema {
333336
items_t items_;
334337
ss::shared_ptr<subschema> additional_items_;
335338

339+
std::vector<ss::shared_ptr<subschema>> one_of_;
340+
336341
std::optional<enum format> format_;
337342

338343
std::optional<std::string> ref_value_;

src/v/iceberg/conversion/json_schema/tests/frontend_test.cc

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,87 @@ TEST(frontend_test, non_object_or_boolean_subschema) {
923923
StrEq("Subschema must be an object or a boolean")));
924924
}
925925

926+
TEST(frontend_test, one_of) {
927+
auto schema = frontend{}.compile(
928+
parse_json(R"({
929+
"$id": "https://example.com/root.json",
930+
"oneOf": [
931+
{ "type": "null" },
932+
{ "type": "string" },
933+
{ "type": "integer" }
934+
]
935+
})"),
936+
"https://example.com/irrelevant-base.json",
937+
dialect::draft7);
938+
939+
auto expected = R"(# (document root)
940+
base uri: https://example.com/root.json
941+
dialect: http://json-schema.org/draft-07/schema#
942+
#/oneOf/0
943+
base uri: https://example.com/root.json
944+
dialect: http://json-schema.org/draft-07/schema#
945+
types: [null]
946+
#/oneOf/1
947+
base uri: https://example.com/root.json
948+
dialect: http://json-schema.org/draft-07/schema#
949+
types: [string]
950+
#/oneOf/2
951+
base uri: https://example.com/root.json
952+
dialect: http://json-schema.org/draft-07/schema#
953+
types: [integer]
954+
)";
955+
956+
ASSERT_EQ(expected, ir_tree_printer::to_string(schema));
957+
ASSERT_EQ(schema.root().one_of().size(), 3);
958+
}
959+
960+
TEST(frontend_test, duplicate_one_of) {
961+
// Not required by json schema but we disallow duplicate keywords to reduce
962+
// ambiguity.
963+
EXPECT_THAT(
964+
[]() {
965+
frontend{}.compile(
966+
parse_json(R"({
967+
"$id": "https://example.com/root.json",
968+
"oneOf": [{ "type": "string" }],
969+
"oneOf": [{ "type": "integer" }]
970+
})"),
971+
"https://example.com/irrelevant-base.json",
972+
dialect::draft7);
973+
},
974+
ThrowsMessage<std::runtime_error>(StrEq("Duplicate keyword: oneOf")));
975+
}
976+
977+
TEST(frontend_test, one_of_empty_array) {
978+
EXPECT_THAT(
979+
[]() {
980+
frontend{}.compile(
981+
parse_json(R"({
982+
"$id": "https://example.com/root.json",
983+
"oneOf": []
984+
})"),
985+
"https://example.com/irrelevant-base.json",
986+
dialect::draft7);
987+
},
988+
ThrowsMessage<std::runtime_error>(
989+
StrEq("The oneOf keyword must be a non-empty array")));
990+
}
991+
992+
TEST(frontend_test, one_of_non_array) {
993+
EXPECT_THAT(
994+
[]() {
995+
frontend{}.compile(
996+
parse_json(R"({
997+
"$id": "https://example.com/root.json",
998+
"oneOf": "not an array"
999+
})"),
1000+
"https://example.com/irrelevant-base.json",
1001+
dialect::draft7);
1002+
},
1003+
ThrowsMessage<std::runtime_error>(
1004+
StrEq("Invalid type for keyword oneOf. Expected one of: [array].")));
1005+
}
1006+
9261007
TEST(frontend_test, non_string_dialect) {
9271008
EXPECT_THAT(
9281009
[]() {

0 commit comments

Comments
 (0)