Skip to content

Commit f4b7f4b

Browse files
committed
feat: add PartitionSpec::Make and PartitionSpec::Validate
1 parent 7f7f85b commit f4b7f4b

File tree

6 files changed

+372
-2
lines changed

6 files changed

+372
-2
lines changed

src/iceberg/partition_spec.cc

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,19 @@
2020
#include "iceberg/partition_spec.h"
2121

2222
#include <algorithm>
23+
#include <cstdint>
2324
#include <format>
2425
#include <memory>
2526
#include <ranges>
27+
#include <unordered_map>
2628

29+
#include "iceberg/result.h"
2730
#include "iceberg/schema.h"
2831
#include "iceberg/schema_field.h"
2932
#include "iceberg/transform.h"
3033
#include "iceberg/util/formatter.h" // IWYU pragma: keep
3134
#include "iceberg/util/macros.h"
35+
#include "iceberg/util/type_util.h"
3236

3337
namespace iceberg {
3438

@@ -104,4 +108,68 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {
104108
return spec_id_ == other.spec_id_ && fields_ == other.fields_;
105109
}
106110

111+
Status PartitionSpec::Validate(const Schema& schema, bool allowMissingFields) const {
112+
std::unordered_map<int32_t, int32_t> parents = indexParents(schema);
113+
for (const auto& partition_field : fields_) {
114+
ICEBERG_ASSIGN_OR_RAISE(auto source_field,
115+
schema.FindFieldById(partition_field.source_id()));
116+
// In the case the underlying field is dropped, we cannot check if they are compatible
117+
if (allowMissingFields && !source_field.has_value()) {
118+
continue;
119+
}
120+
auto field_transform = partition_field.transform();
121+
122+
// In the case of a Version 1 partition-spec field gets deleted, it is replaced with a
123+
// void transform, see: https://iceberg.apache.org/spec/#partition-transforms. We
124+
// don't care about the source type since a VoidTransform is always compatible and
125+
// skip the checks
126+
if (field_transform->transform_type() != TransformType::kVoid) {
127+
if (!source_field.has_value()) {
128+
return InvalidArgument("Cannot find source column for partition field: {}",
129+
partition_field);
130+
}
131+
const auto& source_type = source_field.value().get().type();
132+
if (!partition_field.transform()->CanTransform(*source_type)) {
133+
return InvalidArgument("Invalid source type {} for transform {}",
134+
source_type->ToString(),
135+
partition_field.transform()->ToString());
136+
}
137+
138+
// The only valid parent types for a PartitionField are StructTypes. This must be
139+
// checked recursively.
140+
auto parent_id_iter = parents.find(partition_field.source_id());
141+
while (parent_id_iter != parents.end()) {
142+
int32_t parent_id = parent_id_iter->second;
143+
ICEBERG_ASSIGN_OR_RAISE(auto parent_field, schema.FindFieldById(parent_id));
144+
if (!parent_field.has_value()) {
145+
return InvalidArgument("Cannot find parent field with ID: {}", parent_id);
146+
}
147+
const auto& parent_type = parent_field.value().get().type();
148+
if (parent_type->type_id() != TypeId::kStruct) {
149+
return InvalidArgument("Invalid partition field parent type: {}",
150+
parent_type->ToString());
151+
}
152+
parent_id_iter = parents.find(parent_id);
153+
}
154+
}
155+
}
156+
return {};
157+
}
158+
159+
Result<std::unique_ptr<PartitionSpec>> PartitionSpec::Make(
160+
const Schema& schema, int32_t spec_id, std::vector<PartitionField> fields,
161+
std::optional<int32_t> last_assigned_field_id, bool allowMissingFields) {
162+
auto partition_spec =
163+
std::make_unique<PartitionSpec>(spec_id, std::move(fields), last_assigned_field_id);
164+
ICEBERG_RETURN_UNEXPECTED(partition_spec->Validate(schema, allowMissingFields));
165+
return partition_spec;
166+
}
167+
168+
Result<std::unique_ptr<PartitionSpec>> PartitionSpec::Make(
169+
int32_t spec_id, std::vector<PartitionField> fields,
170+
std::optional<int32_t> last_assigned_field_id) {
171+
return std::make_unique<PartitionSpec>(spec_id, std::move(fields),
172+
last_assigned_field_id);
173+
}
174+
107175
} // namespace iceberg

src/iceberg/partition_spec.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,38 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable {
8080
return lhs.Equals(rhs);
8181
}
8282

83+
/// \brief Validates the partition spec against a schema.
84+
/// \param schema The schema to validate against.
85+
/// \param allowMissingFields Whether to skip validation for partition fields whose
86+
/// source columns have been dropped from the schema.
87+
/// \return Error status if the partition spec is invalid.
88+
Status Validate(const Schema& schema, bool allowMissingFields) const;
89+
90+
/// \brief Create a PartitionSpec binding to a schema.
91+
/// \param schema The schema to bind the partition spec to.
92+
/// \param spec_id The spec ID.
93+
/// \param fields The partition fields.
94+
/// \param last_assigned_field_id The last assigned field ID assigned to ensure new
95+
/// fields get unique IDs.
96+
/// \param allowMissingFields Whether to skip validation for partition fields whose
97+
/// source columns have been dropped from the schema.
98+
/// \return A Result containing the partition spec or an error.
99+
static Result<std::unique_ptr<PartitionSpec>> Make(
100+
const Schema& schema, int32_t spec_id, std::vector<PartitionField> fields,
101+
std::optional<int32_t> last_assigned_field_id = std::nullopt,
102+
bool allowMissingFields = false);
103+
104+
/// \brief Create a PartitionSpec without binding to a schema.
105+
/// \param spec_id The spec ID.
106+
/// \param fields The partition fields.
107+
/// \param last_assigned_field_id The last assigned field ID assigned to ensure new
108+
/// fields get unique IDs.
109+
/// \return A Result containing the partition spec or an error.
110+
/// \note This method does not check whether the sort fields are valid for any schema.
111+
static Result<std::unique_ptr<PartitionSpec>> Make(
112+
int32_t spec_id, std::vector<PartitionField> fields,
113+
std::optional<int32_t> last_assigned_field_id = std::nullopt);
114+
83115
private:
84116
/// \brief Compare two partition specs for equality.
85117
bool Equals(const PartitionSpec& other) const;

src/iceberg/sort_order.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ bool SortOrder::Equals(const SortOrder& other) const {
9090
Status SortOrder::Validate(const Schema& schema) const {
9191
for (const auto& field : fields_) {
9292
ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema.FindFieldById(field.source_id()));
93-
if (!schema_field.has_value() || schema_field == std::nullopt) {
93+
if (!schema_field.has_value()) {
9494
return InvalidArgument("Cannot find source column for sort field: {}", field);
9595
}
9696

src/iceberg/sort_order.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
8888
/// \param fields The sort fields.
8989
/// \return A Result containing the SortOrder or an error.
9090
/// \note This method does not check whether the sort fields are valid for any schema.
91-
/// Use IsBoundToSchema to check if the sort order is valid for a given schema.
9291
static Result<std::unique_ptr<SortOrder>> Make(int32_t sort_id,
9392
std::vector<SortField> fields);
9493

src/iceberg/test/partition_spec_test.cc

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,4 +152,174 @@ TEST(PartitionSpecTest, PartitionTypeTest) {
152152
EXPECT_EQ(pt_field3, partition_type.value()->fields()[2]);
153153
}
154154

155+
TEST(PartitionSpecTest, MissingSourceColumn) {
156+
SchemaField field1(1, "id", int64(), false);
157+
SchemaField field2(2, "ts", timestamp(), false);
158+
SchemaField field3(6, "s", string(), false);
159+
Schema schema({field1, field2, field3}, Schema::kInitialSchemaId);
160+
161+
// Try to create partition field with non-existent source column ID 999
162+
PartitionField pt_field_invalid(999, 1000, "missing_partition", Transform::Identity());
163+
164+
auto result = PartitionSpec::Make(schema, 1, {pt_field_invalid});
165+
EXPECT_FALSE(result.has_value());
166+
EXPECT_THAT(result.error().message,
167+
::testing::HasSubstr("Cannot find source column for partition field"));
168+
}
169+
170+
TEST(PartitionSpecTest, InvalidTransformForType) {
171+
// Test Day transform on string type (should fail)
172+
SchemaField field_string(6, "s", string(), false);
173+
Schema schema_string({field_string}, Schema::kInitialSchemaId);
174+
175+
PartitionField pt_field_invalid(6, 1005, "s_day", Transform::Day());
176+
177+
auto result_string = PartitionSpec::Make(schema_string, 1, {pt_field_invalid});
178+
EXPECT_FALSE(result_string.has_value());
179+
EXPECT_THAT(result_string.error().message, ::testing::HasSubstr("Invalid source type"));
180+
}
181+
182+
TEST(PartitionSpecTest, SourceIdNotFound) {
183+
SchemaField field1(1, "id", int64(), false);
184+
SchemaField field2(2, "ts", timestamp(), false);
185+
Schema schema({field1, field2}, Schema::kInitialSchemaId);
186+
187+
// Try to create partition field with source ID 99 which doesn't exist
188+
PartitionField pt_field_invalid(99, 1000, "Test", Transform::Identity());
189+
190+
auto result = PartitionSpec::Make(schema, 1, {pt_field_invalid});
191+
EXPECT_FALSE(result.has_value());
192+
EXPECT_THAT(result.error().message,
193+
::testing::HasSubstr("Cannot find source column for partition field"));
194+
}
195+
196+
TEST(PartitionSpecTest, PartitionFieldInStruct) {
197+
SchemaField field1(1, "id", int64(), false);
198+
SchemaField field2(2, "ts", timestamp(), false);
199+
Schema base_schema({field1, field2}, Schema::kInitialSchemaId);
200+
201+
// Create a struct that contains the base schema fields
202+
auto struct_type =
203+
std::make_shared<StructType>(std::vector<SchemaField>{field1, field2});
204+
SchemaField outer_struct(11, "MyStruct", struct_type, false);
205+
206+
Schema schema({outer_struct}, Schema::kInitialSchemaId);
207+
208+
// Partition on a field within the struct (id field with ID 1)
209+
PartitionField pt_field(1, 1000, "id_partition", Transform::Identity());
210+
211+
auto result = PartitionSpec::Make(schema, 1, {pt_field});
212+
EXPECT_TRUE(result.has_value()) << result.error().message;
213+
}
214+
215+
TEST(PartitionSpecTest, PartitionFieldInStructInStruct) {
216+
SchemaField field1(1, "id", int64(), false);
217+
SchemaField field2(2, "ts", timestamp(), false);
218+
219+
// Create inner struct
220+
auto inner_struct =
221+
std::make_shared<StructType>(std::vector<SchemaField>{field1, field2});
222+
SchemaField inner_field(11, "Inner", inner_struct, false);
223+
224+
// Create outer struct containing inner struct
225+
auto outer_struct = std::make_shared<StructType>(std::vector<SchemaField>{inner_field});
226+
SchemaField outer_field(12, "Outer", outer_struct, true);
227+
228+
Schema schema({outer_field}, Schema::kInitialSchemaId);
229+
230+
// Partition on a field deep within nested structs (id field with ID 1)
231+
PartitionField pt_field(1, 1000, "id_partition", Transform::Identity());
232+
233+
auto result = PartitionSpec::Make(schema, 1, {pt_field});
234+
EXPECT_TRUE(result.has_value()) << result.error().message;
235+
}
236+
237+
TEST(PartitionSpecTest, PartitionFieldInList) {
238+
auto list_type = std::make_shared<ListType>(1, int32(), /*element_required=*/false);
239+
SchemaField list_field(2, "MyList", list_type, false);
240+
241+
Schema schema({list_field}, Schema::kInitialSchemaId);
242+
243+
// Try to partition on the list element field (field ID 1 is the element)
244+
PartitionField pt_field(1, 1000, "element_partition", Transform::Identity());
245+
246+
auto result = PartitionSpec::Make(schema, 1, {pt_field});
247+
EXPECT_FALSE(result.has_value());
248+
EXPECT_THAT(result.error().message,
249+
::testing::HasSubstr("Invalid partition field parent"));
250+
}
251+
252+
TEST(PartitionSpecTest, PartitionFieldInStructInList) {
253+
auto struct_in_list = std::make_shared<StructType>(
254+
std::vector<SchemaField>{SchemaField(1, "Foo", int32(), true)});
255+
auto list_type = std::make_shared<ListType>(2, struct_in_list,
256+
/*element_required=*/false);
257+
SchemaField list_field(3, "MyList", list_type, false);
258+
259+
Schema schema({list_field}, Schema::kInitialSchemaId);
260+
261+
// Try to partition on a field within a struct inside a list (Foo field with ID 1)
262+
PartitionField pt_field(1, 1000, "foo_partition", Transform::Identity());
263+
264+
auto result = PartitionSpec::Make(schema, 1, {pt_field});
265+
EXPECT_FALSE(result.has_value());
266+
EXPECT_THAT(result.error().message,
267+
::testing::HasSubstr("Invalid partition field parent"));
268+
}
269+
270+
TEST(PartitionSpecTest, PartitionFieldInMap) {
271+
SchemaField key_field(1, "key", int32(), false);
272+
SchemaField value_field(2, "value", int32(), false);
273+
auto map_type = std::make_shared<MapType>(key_field, value_field);
274+
SchemaField map_field(3, "MyMap", map_type, false);
275+
276+
Schema schema({map_field}, Schema::kInitialSchemaId);
277+
278+
// Try to partition on the map key field (field ID 1 is the key)
279+
PartitionField pt_field_key(1, 1000, "key_partition", Transform::Identity());
280+
281+
auto result_key = PartitionSpec::Make(schema, 1, {pt_field_key});
282+
EXPECT_FALSE(result_key.has_value());
283+
EXPECT_THAT(result_key.error().message,
284+
::testing::HasSubstr("Invalid partition field parent"));
285+
286+
// Try to partition on the map value field (field ID 2 is the value)
287+
PartitionField pt_field_value(2, 1001, "value_partition", Transform::Identity());
288+
289+
auto result_value = PartitionSpec::Make(schema, 1, {pt_field_value});
290+
EXPECT_FALSE(result_value.has_value());
291+
EXPECT_THAT(result_value.error().message,
292+
::testing::HasSubstr("Invalid partition field parent"));
293+
}
294+
295+
TEST(PartitionSpecTest, PartitionFieldInStructInMap) {
296+
auto struct_key = std::make_shared<StructType>(
297+
std::vector<SchemaField>{SchemaField(1, "Foo", int32(), true)});
298+
auto struct_value = std::make_shared<StructType>(
299+
std::vector<SchemaField>{SchemaField(2, "Bar", int32(), true)});
300+
301+
SchemaField key_field(3, "key", struct_key, false);
302+
SchemaField value_field(4, "value", struct_value, false);
303+
auto map_type = std::make_shared<MapType>(key_field, value_field);
304+
SchemaField map_field(5, "MyMap", map_type, false);
305+
306+
Schema schema({map_field}, Schema::kInitialSchemaId);
307+
308+
// Try to partition on a field within the key struct (Foo field with ID 1)
309+
PartitionField pt_field_key(1, 1000, "foo_partition", Transform::Identity());
310+
311+
auto result_key = PartitionSpec::Make(schema, 1, {pt_field_key});
312+
EXPECT_FALSE(result_key.has_value());
313+
EXPECT_THAT(result_key.error().message,
314+
::testing::HasSubstr("Invalid partition field parent"));
315+
316+
// Try to partition on a field within the value struct (Bar field with ID 2)
317+
PartitionField pt_field_value(2, 1001, "bar_partition", Transform::Identity());
318+
319+
auto result_value = PartitionSpec::Make(schema, 1, {pt_field_value});
320+
EXPECT_FALSE(result_value.has_value());
321+
EXPECT_THAT(result_value.error().message,
322+
::testing::HasSubstr("Invalid partition field parent"));
323+
}
324+
155325
} // namespace iceberg

0 commit comments

Comments
 (0)