diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b65b5287f..3f5056211 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -137,4 +137,4 @@ jobs: meson compile -C builddir - name: Test Iceberg run: | - meson test -C builddir + meson test -C builddir --timeout-multiplier=2 diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 275d71fce..385bd8726 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -72,6 +72,7 @@ set(ICEBERG_SOURCES transform.cc transform_function.cc type.cc + update/update_partition_spec.cc update/update_properties.cc util/bucket_util.cc util/conversions.cc diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index c139c66b5..ae095850f 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -94,6 +94,7 @@ iceberg_sources = files( 'transform.cc', 'transform_function.cc', 'type.cc', + 'update/update_partition_spec.cc', 'update/update_properties.cc', 'util/bucket_util.cc', 'util/conversions.cc', @@ -193,7 +194,6 @@ install_headers( 'transform.h', 'type_fwd.h', 'type.h', - 'update/update_properties.h', ], subdir: 'iceberg', ) diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc index 458711255..21c183cb9 100644 --- a/src/iceberg/table.cc +++ b/src/iceberg/table.cc @@ -26,6 +26,7 @@ #include "iceberg/table_metadata.h" #include "iceberg/table_properties.h" #include "iceberg/table_scan.h" +#include "iceberg/update/update_partition_spec.h" #include "iceberg/update/update_properties.h" #include "iceberg/util/macros.h" @@ -113,6 +114,10 @@ std::unique_ptr Table::UpdateProperties() const { return std::make_unique(identifier_, catalog_, metadata_); } +std::unique_ptr Table::UpdateSpec() { + return std::make_unique(identifier_, catalog_, metadata_); +} + std::unique_ptr Table::NewTransaction() const { throw NotImplemented("Table::NewTransaction is not implemented"); } diff --git a/src/iceberg/table.h b/src/iceberg/table.h index 
df3a0c32e..8bb9a4ccc 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -110,6 +110,11 @@ class ICEBERG_EXPORT Table { /// \return a new UpdateProperties instance virtual std::unique_ptr UpdateProperties() const; + /// \brief Create a new UpdatePartitionSpec to alter the partition spec of this table + /// and commit the changes. + /// \return a pointer to the new UpdatePartitionSpec + virtual std::unique_ptr UpdateSpec(); + /// \brief Create a new table scan builder for this table /// /// Once a table scan builder is created, it can be refined to project columns and diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 9892e3d4f..63119337a 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -77,6 +77,7 @@ add_iceberg_test(table_test table_requirement_test.cc table_requirements_test.cc table_update_test.cc + update_partition_spec_test.cc update_properties_test.cc) add_iceberg_test(expression_test diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index c73abe188..60856b872 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -53,6 +53,7 @@ iceberg_tests = { 'table_requirement_test.cc', 'table_test.cc', 'table_update_test.cc', + 'update_partition_spec_test.cc', 'update_properties_test.cc', ), }, diff --git a/src/iceberg/test/update_partition_spec_test.cc b/src/iceberg/test/update_partition_spec_test.cc new file mode 100644 index 000000000..aea293a7d --- /dev/null +++ b/src/iceberg/test/update_partition_spec_test.cc @@ -0,0 +1,897 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/update_partition_spec.h" + +#include +#include +#include +#include + +#include +#include + +#include "iceberg/expression/expressions.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/sort_order.h" +#include "iceberg/table.h" +#include "iceberg/table_identifier.h" +#include "iceberg/table_metadata.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/mock_catalog.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { + +// Test schema matching Java test +std::shared_ptr CreateTestSchema() { + return std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeRequired(2, "ts", timestamp_tz()), + SchemaField::MakeRequired(3, "category", string()), + SchemaField::MakeOptional(4, "data", string())}, + 0); +} + +// Create partitioned spec matching Java test +std::shared_ptr CreatePartitionedSpec() { + ICEBERG_ASSIGN_OR_THROW( + auto spec_result, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + return spec_result; +} + +// Create base metadata for testing +std::shared_ptr CreateBaseMetadata(int8_t format_version, + std::shared_ptr spec) { + auto metadata = std::make_shared(); + metadata->format_version = format_version; + 
metadata->table_uuid = "test-uuid-1234"; + metadata->location = "s3://bucket/test"; + metadata->last_sequence_number = 0; + metadata->last_updated_ms = TimePointMs{std::chrono::milliseconds(1000)}; + metadata->last_column_id = 4; + metadata->current_schema_id = 0; + metadata->schemas.push_back(CreateTestSchema()); + metadata->partition_specs.push_back(spec); + metadata->default_spec_id = spec->spec_id(); + metadata->last_partition_id = spec->last_assigned_field_id(); + metadata->current_snapshot_id = Snapshot::kInvalidSnapshotId; + metadata->default_sort_order_id = SortOrder::kInitialSortOrderId; + metadata->sort_orders.push_back(SortOrder::Unsorted()); + metadata->next_row_id = TableMetadata::kInitialRowId; + return metadata; +} + +// Helper to create UpdatePartitionSpec +std::unique_ptr CreateUpdatePartitionSpec( + int8_t format_version, std::shared_ptr base_spec) { + auto catalog = std::make_shared(); + auto metadata = CreateBaseMetadata(format_version, base_spec); + TableIdentifier identifier{.ns = Namespace{.levels = {"test"}}, .name = "test_table"}; + return std::make_unique(std::move(identifier), catalog, metadata); +} + +// Helper to assert partition spec equality +void AssertPartitionSpecEquals(const PartitionSpec& expected, + const PartitionSpec& actual) { + ASSERT_EQ(expected.fields().size(), actual.fields().size()); + for (size_t i = 0; i < expected.fields().size(); ++i) { + const auto& expected_field = expected.fields()[i]; + const auto& actual_field = actual.fields()[i]; + EXPECT_EQ(expected_field.source_id(), actual_field.source_id()); + EXPECT_EQ(expected_field.field_id(), actual_field.field_id()); + EXPECT_EQ(expected_field.name(), actual_field.name()); + EXPECT_EQ(*expected_field.transform(), *actual_field.transform()); + } +} + +} // namespace + +class UpdatePartitionSpecTest : public ::testing::TestWithParam { + protected: + void SetUp() override { + schema_ = CreateTestSchema(); + unpartitioned_ = PartitionSpec::Unpartitioned(); + partitioned_ 
= CreatePartitionedSpec(); + format_version_ = GetParam(); + } + + std::shared_ptr schema_; + std::shared_ptr unpartitioned_; + std::shared_ptr partitioned_; + int8_t format_version_; +}; + +INSTANTIATE_TEST_SUITE_P(FormatVersions, UpdatePartitionSpecTest, ::testing::Values(1, 2), + [](const ::testing::TestParamInfo& info) { + return std::format("V{}", info.param); + }); + +TEST_P(UpdatePartitionSpecTest, TestAddIdentityByName) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField("category"); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{PartitionField( + 3, 1000, "category", Transform::Identity())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddIdentityByTerm) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + auto ref = Expressions::Ref("category"); + update->AddField(ref); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{PartitionField( + 3, 1000, "category", Transform::Identity())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddYear) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Year("ts")); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(2, 
1000, "ts_year", Transform::Year())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddMonth) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Month("ts")); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(2, 1000, "ts_month", Transform::Month())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddDay) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Day("ts")); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL(auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{PartitionField( + 2, 1000, "ts_day", Transform::Day())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddHour) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Hour("ts")); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(2, 1000, "ts_hour", Transform::Hour())})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddBucket) { + auto update = 
CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Bucket("id", 16)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{PartitionField( + 1, 1000, "id_bucket_16", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddTruncate) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField(Expressions::Truncate("data", 4)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{PartitionField( + 4, 1000, "data_trunc_4", Transform::Truncate(4))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddNamedPartition) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField("shard", Expressions::Bucket("id", 16)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(1, 1000, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddToExisting) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField(Expressions::Truncate("data", 4)); + ASSERT_THAT(update->Apply(), IsOk()); + + 
ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16)), + PartitionField(4, 1003, "data_trunc_4", Transform::Truncate(4))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestMultipleAdds) { + auto update = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update->AddField("category") + .AddField(Expressions::Day("ts")) + .AddField("shard", Expressions::Bucket("id", 16)) + .AddField("prefix", Expressions::Truncate("data", 4)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16)), + PartitionField(4, 1003, "prefix", Transform::Truncate(4))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestAddHourToDay) { + // First add day partition + auto update1 = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update1->AddField(Expressions::Day("ts")); + ASSERT_THAT(update1->Apply(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto by_day_spec, update1->GetAppliedSpec()); + + // Then add hour partition + auto metadata = CreateBaseMetadata(format_version_, by_day_spec); + auto catalog = std::make_shared(); + TableIdentifier identifier{.ns = Namespace{.levels = {"test"}}, .name = "test_table"}; 
+ auto update2 = std::make_unique(identifier, catalog, metadata); + update2->AddField(Expressions::Hour("ts")); + ASSERT_THAT(update2->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto by_hour_spec, update2->GetAppliedSpec()); + + ASSERT_EQ(by_hour_spec->fields().size(), 2); + EXPECT_EQ(by_hour_spec->fields()[0].source_id(), 2); + EXPECT_EQ(by_hour_spec->fields()[0].name(), "ts_day"); + EXPECT_EQ(*by_hour_spec->fields()[0].transform(), *Transform::Day()); + EXPECT_EQ(by_hour_spec->fields()[1].source_id(), 2); + EXPECT_EQ(by_hour_spec->fields()[1].name(), "ts_hour"); + EXPECT_EQ(*by_hour_spec->fields()[1].transform(), *Transform::Hour()); +} + +TEST_P(UpdatePartitionSpecTest, TestAddMultipleBuckets) { + // First add bucket 16 + auto update1 = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update1->AddField(Expressions::Bucket("id", 16)); + ASSERT_THAT(update1->Apply(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto bucket16_spec, update1->GetAppliedSpec()); + + // Then add bucket 8 + auto metadata = CreateBaseMetadata(format_version_, bucket16_spec); + auto catalog = std::make_shared(); + TableIdentifier identifier{.ns = Namespace{.levels = {"test"}}, .name = "test_table"}; + auto update2 = std::make_unique(identifier, catalog, metadata); + update2->AddField(Expressions::Bucket("id", 8)); + ASSERT_THAT(update2->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto bucket8_spec, update2->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(1, 1000, "id_bucket_16", Transform::Bucket(16)), + PartitionField(1, 1001, "id_bucket_8", Transform::Bucket(8))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *bucket8_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveIdentityByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField("category"); + 
ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + // V1: deleted fields are replaced with void transform + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Void()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + // V2: deleted fields are removed + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveBucketByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField("shard"); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + // V1: deleted fields are replaced with void transform + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Void())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + // V2: deleted fields are removed + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", 
Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveIdentityByEquivalent) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + auto ref = Expressions::Ref("category"); + update->RemoveField(ref); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Void()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveDayByEquivalent) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField(Expressions::Day("ts")); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Void()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = 
std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(1, 1002, "shard", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveBucketByEquivalent) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField(Expressions::Bucket("id", 16)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Void())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRename) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RenameField("shard", "id_bucket"); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + 
PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "id_bucket", Transform::Bucket(16))})); + auto expected = std::shared_ptr(expected_spec.release()); + + AssertPartitionSpecEquals(*expected, *updated_spec); +} + +TEST_P(UpdatePartitionSpecTest, TestMultipleChanges) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RenameField("shard", "id_bucket") + .RemoveField(Expressions::Day("ts")) + .AddField("prefix", Expressions::Truncate("data", 4)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Void()), + PartitionField(1, 1002, "id_bucket", Transform::Bucket(16)), + PartitionField(4, 1003, "prefix", Transform::Truncate(4))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(1, 1002, "id_bucket", Transform::Bucket(16)), + PartitionField(4, 1003, "prefix", Transform::Truncate(4))})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestAddDeletedName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField(Expressions::Bucket("id", 16)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + if (format_version_ == 1) { + ICEBERG_UNWRAP_OR_FAIL( + 
auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day()), + PartitionField(1, 1002, "shard", Transform::Void())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } else { + ICEBERG_UNWRAP_OR_FAIL( + auto expected_spec, + PartitionSpec::Make( + PartitionSpec::kInitialSpecId, + std::vector{ + PartitionField(3, 1000, "category", Transform::Identity()), + PartitionField(2, 1001, "ts_day", Transform::Day())})); + auto expected = std::shared_ptr(expected_spec.release()); + AssertPartitionSpecEquals(*expected, *updated_spec); + } +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveNewlyAddedFieldByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("prefix", Expressions::Truncate("data", 4)); + update->RemoveField("prefix"); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot delete newly added field")); +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveNewlyAddedFieldByTransform) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("prefix", Expressions::Truncate("data", 4)); + update->RemoveField(Expressions::Truncate("data", 4)); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot delete newly added field")); +} + +TEST_P(UpdatePartitionSpecTest, TestAddAlreadyAddedFieldByTransform) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("prefix", Expressions::Truncate("data", 4)); + update->AddField(Expressions::Truncate("data", 4)); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition 
field")); +} + +TEST_P(UpdatePartitionSpecTest, TestAddAlreadyAddedFieldByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("prefix", Expressions::Truncate("data", 4)); + update->AddField("prefix", Expressions::Truncate("data", 6)); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition field")); +} + +TEST_P(UpdatePartitionSpecTest, TestAddRedundantTimePartition) { + // Test day + hour conflict + auto update1 = CreateUpdatePartitionSpec(format_version_, unpartitioned_); + update1->AddField(Expressions::Day("ts")); + update1->AddField(Expressions::Hour("ts")); + EXPECT_THAT(update1->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update1->Apply(), HasErrorMessage("Cannot add redundant partition field")); + + // Test hour + month conflict after adding hour to existing day + auto update2 = CreateUpdatePartitionSpec(format_version_, partitioned_); + update2->AddField(Expressions::Hour("ts")); // day already exists, so hour is OK + update2->AddField(Expressions::Month("ts")); // conflicts with hour + EXPECT_THAT(update2->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update2->Apply(), HasErrorMessage("Cannot add redundant partition")); +} + +TEST_P(UpdatePartitionSpecTest, TestNoEffectAddDeletedSameFieldWithSameName) { + auto update1 = CreateUpdatePartitionSpec(format_version_, partitioned_); + update1->RemoveField("shard"); + update1->AddField("shard", Expressions::Bucket("id", 16)); + ASSERT_THAT(update1->Apply(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto spec1, update1->GetAppliedSpec()); + AssertPartitionSpecEquals(*partitioned_, *spec1); + + auto update2 = CreateUpdatePartitionSpec(format_version_, partitioned_); + update2->RemoveField("shard"); + update2->AddField(Expressions::Bucket("id", 16)); + ASSERT_THAT(update2->Apply(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto spec2, 
update2->GetAppliedSpec()); + AssertPartitionSpecEquals(*partitioned_, *spec2); +} + +TEST_P(UpdatePartitionSpecTest, TestGenerateNewSpecAddDeletedSameFieldWithDifferentName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField("shard"); + update->AddField("new_shard", Expressions::Bucket("id", 16)); + ASSERT_THAT(update->Apply(), IsOk()); + + ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec()); + + ASSERT_EQ(updated_spec->fields().size(), 3); + EXPECT_EQ(updated_spec->fields()[0].name(), "category"); + EXPECT_EQ(updated_spec->fields()[1].name(), "ts_day"); + EXPECT_EQ(updated_spec->fields()[2].name(), "new_shard"); + EXPECT_EQ(*updated_spec->fields()[0].transform(), *Transform::Identity()); + EXPECT_EQ(*updated_spec->fields()[1].transform(), *Transform::Day()); + EXPECT_EQ(*updated_spec->fields()[2].transform(), *Transform::Bucket(16)); +} + +TEST_P(UpdatePartitionSpecTest, TestAddDuplicateByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("category"); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition field")); +} + +TEST_P(UpdatePartitionSpecTest, TestAddDuplicateByRef) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + auto ref = Expressions::Ref("category"); + update->AddField(ref); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition field")); +} + +TEST_P(UpdatePartitionSpecTest, TestAddDuplicateTransform) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField(Expressions::Bucket("id", 16)); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition field")); +} + +TEST_P(UpdatePartitionSpecTest, 
TestAddNamedDuplicate) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("b16", Expressions::Bucket("id", 16)); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot add duplicate partition field")); +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveUnknownFieldByName) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField("moon"); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot find partition field to remove")); +} + +TEST_P(UpdatePartitionSpecTest, TestRemoveUnknownFieldByEquivalent) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RemoveField(Expressions::Hour("ts")); // day(ts) exists, not hour + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), HasErrorMessage("Cannot find partition field to remove")); +} + +TEST_P(UpdatePartitionSpecTest, TestRenameUnknownField) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RenameField("shake", "seal"); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), + HasErrorMessage("Cannot find partition field to rename: shake")); +} + +TEST_P(UpdatePartitionSpecTest, TestRenameAfterAdd) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->AddField("data_trunc", Expressions::Truncate("data", 4)); + update->RenameField("data_trunc", "prefix"); + EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(update->Apply(), + HasErrorMessage("Cannot rename newly added partition field: data_trunc")); +} + +TEST_P(UpdatePartitionSpecTest, TestRenameAndDelete) { + auto update = CreateUpdatePartitionSpec(format_version_, partitioned_); + update->RenameField("shard", "id_bucket"); + 
update->RemoveField(Expressions::Bucket("id", 16));
+  EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed));
+  EXPECT_THAT(update->Apply(),
+              HasErrorMessage("Cannot rename and delete partition field: shard"));
+}
+
+// Mirror image of TestRenameAndDelete: the field is first removed (by transform) and
+// then renamed (by name); the rename must be rejected.
+TEST_P(UpdatePartitionSpecTest, TestDeleteAndRename) {
+  auto update = CreateUpdatePartitionSpec(format_version_, partitioned_);
+  update->RemoveField(Expressions::Bucket("id", 16));
+  update->RenameField("shard", "id_bucket");
+  EXPECT_THAT(update->Apply(), IsError(ErrorKind::kValidationFailed));
+  EXPECT_THAT(update->Apply(),
+              HasErrorMessage("Cannot delete and rename partition field: shard"));
+}
+
+// Evolves a spec through six consecutive updates (add day, remove, add day, remove,
+// add month, rename month -> "ts_date"). Each round builds fresh base metadata from
+// the spec produced by the previous round, so every update starts from the prior
+// result. The final expectations differ by format version: V1 keeps the two removed
+// fields as void-transform placeholders, V2+ ends with the single month field.
+TEST_P(UpdatePartitionSpecTest, TestRemoveAndAddMultiTimes) {
+  // Add first time
+  auto update1 = CreateUpdatePartitionSpec(format_version_, unpartitioned_);
+  update1->AddField("ts_date", Expressions::Day("ts"));
+  ASSERT_THAT(update1->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto add_first_time_spec, update1->GetAppliedSpec());
+
+  // Remove first time
+  auto metadata1 = CreateBaseMetadata(format_version_, add_first_time_spec);
+  auto catalog1 = std::make_shared();
+  TableIdentifier identifier1{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update2 = std::make_unique(identifier1, catalog1, metadata1);
+  update2->RemoveField(Expressions::Day("ts"));
+  ASSERT_THAT(update2->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto remove_first_time_spec, update2->GetAppliedSpec());
+
+  // Add second time
+  auto metadata2 = CreateBaseMetadata(format_version_, remove_first_time_spec);
+  auto catalog2 = std::make_shared();
+  TableIdentifier identifier2{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update3 = std::make_unique(identifier2, catalog2, metadata2);
+  update3->AddField("ts_date", Expressions::Day("ts"));
+  ASSERT_THAT(update3->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto add_second_time_spec, update3->GetAppliedSpec());
+
+  // Remove second time
+  auto metadata3 = CreateBaseMetadata(format_version_, add_second_time_spec);
+  auto catalog3 = std::make_shared();
+  TableIdentifier identifier3{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update4 = std::make_unique(identifier3, catalog3, metadata3);
+  update4->RemoveField(Expressions::Day("ts"));
+  ASSERT_THAT(update4->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto remove_second_time_spec, update4->GetAppliedSpec());
+
+  // Add third time with month
+  auto metadata4 = CreateBaseMetadata(format_version_, remove_second_time_spec);
+  auto catalog4 = std::make_shared();
+  TableIdentifier identifier4{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update5 = std::make_unique(identifier4, catalog4, metadata4);
+  update5->AddField(Expressions::Month("ts"));
+  ASSERT_THAT(update5->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto add_third_time_spec, update5->GetAppliedSpec());
+
+  // Rename ts_month to ts_date
+  auto metadata5 = CreateBaseMetadata(format_version_, add_third_time_spec);
+  auto catalog5 = std::make_shared();
+  TableIdentifier identifier5{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update6 = std::make_unique(identifier5, catalog5, metadata5);
+  update6->RenameField("ts_month", "ts_date");
+  ASSERT_THAT(update6->Apply(), IsOk());
+  ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update6->GetAppliedSpec());
+
+  if (format_version_ == 1) {
+    ASSERT_EQ(updated_spec->fields().size(), 3);
+    // In V1, we expect void transforms for deleted fields
+    // The placeholders only need to start with "ts_date"; their exact names are not
+    // pinned by this test.
+    EXPECT_TRUE(updated_spec->fields()[0].name().find("ts_date") == 0);
+    EXPECT_TRUE(updated_spec->fields()[1].name().find("ts_date") == 0);
+    EXPECT_EQ(updated_spec->fields()[2].name(), "ts_date");
+    EXPECT_EQ(*updated_spec->fields()[0].transform(), *Transform::Void());
+    EXPECT_EQ(*updated_spec->fields()[1].transform(), *Transform::Void());
+    EXPECT_EQ(*updated_spec->fields()[2].transform(), *Transform::Month());
+  } else {
+    // In V2+ the removed fields are gone, leaving one month field on source id 2 with
+    // field id 1000.
+    ICEBERG_UNWRAP_OR_FAIL(
+        auto expected_spec,
+        PartitionSpec::Make(PartitionSpec::kInitialSpecId,
+                            std::vector{
+                                PartitionField(2, 1000, "ts_date", Transform::Month())}));
+    auto expected = std::shared_ptr(expected_spec.release());
+    AssertPartitionSpecEquals(*expected, *updated_spec);
+  }
+}
+
+// Starts from a spec with a single month(ts) field named "ts_transformed", removes it
+// and, in the same update, adds day(ts) under the same name. V1 must keep the removed
+// field as a void placeholder next to the new field; V2+ must contain only the new
+// day field.
+TEST_P(UpdatePartitionSpecTest, TestRemoveAndUpdateWithDifferentTransformation) {
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto expected_spec,
+      PartitionSpec::Make(PartitionSpec::kInitialSpecId,
+                          std::vector{PartitionField(
+                              2, 1000, "ts_transformed", Transform::Month())}));
+  auto expected = std::shared_ptr(expected_spec.release());
+  auto metadata = CreateBaseMetadata(format_version_, expected);
+  auto catalog = std::make_shared();
+  TableIdentifier identifier{.ns = Namespace{.levels = {"test"}}, .name = "test_table"};
+  auto update = std::make_unique(identifier, catalog, metadata);
+  update->RemoveField("ts_transformed");
+  update->AddField("ts_transformed", Expressions::Day("ts"));
+  ASSERT_THAT(update->Apply(), IsOk());
+
+  ICEBERG_UNWRAP_OR_FAIL(auto updated_spec, update->GetAppliedSpec());
+
+  if (format_version_ == 1) {
+    ASSERT_EQ(updated_spec->fields().size(), 2);
+    // The void placeholder keeps a name that starts with the original field name.
+    EXPECT_TRUE(updated_spec->fields()[0].name().find("ts_transformed") == 0);
+    EXPECT_EQ(updated_spec->fields()[1].name(), "ts_transformed");
+    EXPECT_EQ(*updated_spec->fields()[0].transform(), *Transform::Void());
+    EXPECT_EQ(*updated_spec->fields()[1].transform(), *Transform::Day());
+  } else {
+    ASSERT_EQ(updated_spec->fields().size(), 1);
+    EXPECT_EQ(updated_spec->fields()[0].name(), "ts_transformed");
+    EXPECT_EQ(*updated_spec->fields()[0].transform(), *Transform::Day());
+  }
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 0e1867f60..e5355d2a1 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -124,6 +124,10 @@ class Literal; class BoundPredicate; class UnboundPredicate; +class BoundReference; +class BoundTransform; +template +class UnboundTerm; class DataTableScan; class FileScanTask; @@
-160,9 +164,8 @@ class TableMetadataBuilder; class TableUpdateContext; class PendingUpdate; -template -class PendingUpdateTyped; class UpdateProperties; +class UpdatePartitionSpec; /// ---------------------------------------------------------------------------- /// TODO: Forward declarations below are not added yet. diff --git a/src/iceberg/update/meson.build b/src/iceberg/update/meson.build new file mode 100644 index 000000000..95e86dc1f --- /dev/null +++ b/src/iceberg/update/meson.build @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers( + ['update_partition_spec.h', 'update_properties.h'], + subdir: 'iceberg/update', +) diff --git a/src/iceberg/update/update_partition_spec.cc b/src/iceberg/update/update_partition_spec.cc new file mode 100644 index 000000000..4b736a531 --- /dev/null +++ b/src/iceberg/update/update_partition_spec.cc @@ -0,0 +1,608 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/update/update_partition_spec.h"
+
+#include
+
+#include "iceberg/catalog.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/partition_field.h"
+#include "iceberg/partition_spec.h"
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/table.h"
+#include "iceberg/table_identifier.h"
+#include "iceberg/table_metadata.h"
+#include "iceberg/table_requirements.h"
+#include "iceberg/table_update.h"
+#include "iceberg/transform.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+// Captures the table identifier, catalog and base metadata, then reads the partition
+// spec and schema from the base metadata and indexes the spec's fields by name and by
+// (source id, transform string).
+//
+// Nothing is thrown on failure: a missing spec or schema, or a spec field whose
+// transform type is kUnknown, is recorded through AddError() and construction stops
+// at that point, leaving the remaining members untouched.
+UpdatePartitionSpec::UpdatePartitionSpec(TableIdentifier identifier,
+                                         std::shared_ptr catalog,
+                                         std::shared_ptr base)
+    : identifier_(std::move(identifier)),
+      catalog_(std::move(catalog)),
+      base_metadata_(std::move(base)) {
+  ICEBERG_DCHECK(catalog_, "Catalog is required to construct UpdatePartitionSpec");
+  ICEBERG_DCHECK(base_metadata_,
+                 "Base table metadata is required to construct UpdatePartitionSpec");
+  format_version_ = base_metadata_->format_version;
+
+  // Get the current/default partition spec
+  auto spec_result = base_metadata_->PartitionSpec();
+  if (!spec_result.has_value()) {
+    AddError(spec_result.error());
+    return;
+  }
+  spec_ = std::move(spec_result.value());
+
+  // Get the current schema
+  auto schema_result = base_metadata_->Schema();
+  if (!schema_result.has_value()) {
+    AddError(schema_result.error());
+    return;
+  }
+  schema_ = std::move(schema_result.value());
+
+  // New field IDs are handed out by incrementing this counter (see AssignFieldId()).
+  last_assigned_partition_id_ = spec_->last_assigned_field_id();
+  name_to_field_ = IndexSpecByName(*spec_);
+  transform_to_field_ = IndexSpecByTransform(*spec_);
+
+  // Check for unknown transforms
+  for (const auto& field : spec_->fields()) {
+    if (field.transform()->transform_type() == TransformType::kUnknown) {
+      AddError(ErrorKind::kInvalidArgument,
+               std::format("Cannot update partition spec with unknown transform: {}",
+                           field.ToString()));
+      return;
+    }
+  }
+}
+
+UpdatePartitionSpec::~UpdatePartitionSpec() = default;
+
+// Sets the case-sensitivity flag that later AddField()/RemoveField() calls pass to
+// schema lookups and term binding. Calls made before this one are not re-resolved.
+UpdatePartitionSpec& UpdatePartitionSpec::CaseSensitive(bool is_case_sensitive) {
+  case_sensitive_ = is_case_sensitive;
+  return *this;
+}
+
+// Clears set_as_default_, so Commit() adds the new spec without also emitting the
+// update that makes it the default.
+UpdatePartitionSpec& UpdatePartitionSpec::AddNonDefaultSpec() {
+  set_as_default_ = false;
+  return *this;
+}
+
+// Adds an identity partition on the schema column `source_name`. No partition name is
+// passed on, so AddFieldInternal() generates one. A failed lookup and a lookup that
+// finds nothing are both recorded as "Cannot find source field".
+UpdatePartitionSpec& UpdatePartitionSpec::AddField(const std::string& source_name) {
+  // Find the source field in the schema
+  auto field_result = schema_->FindFieldByName(source_name, case_sensitive_);
+  if (!field_result.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot find source field: {}", source_name));
+    return *this;
+  }
+
+  auto field_opt = field_result.value();
+  if (!field_opt.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot find source field: {}", source_name));
+    return *this;
+  }
+
+  int32_t source_id = field_opt.value().get().field_id();
+  return AddFieldInternal(nullptr, source_id, Transform::Identity());
+}
+
+// Unnamed overload: forwards to the named AddField() with std::nullopt so that the
+// partition name is generated.
+UpdatePartitionSpec& UpdatePartitionSpec::AddField(
+    std::shared_ptr> term) {
+  return AddField(std::nullopt, std::move(term));
+}
+
+// Unnamed overload: forwards to the named AddField() with std::nullopt so that the
+// partition name is generated.
+UpdatePartitionSpec& UpdatePartitionSpec::AddField(
+    std::shared_ptr> term) {
+  return AddField(std::nullopt, std::move(term));
+}
+
+UpdatePartitionSpec& UpdatePartitionSpec::AddField(
+    std::optional name, std::shared_ptr> term) {
+  // Bind the term to get the source field
+  auto bound_result = term->Bind(*schema_,
case_sensitive_);
+  if (!bound_result.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot bind term: {}", term->ToString()));
+    return *this;
+  }
+
+  auto bound_ref = bound_result.value();
+  int32_t source_id = bound_ref->field().field_id();
+
+  // Reference terms use identity transform
+  return AddFieldInternal(name ? &name.value() : nullptr, source_id,
+                          Transform::Identity());
+}
+
+// Adds a partition field described by a transform term. The term is bound against the
+// schema to obtain the source column ID and the transform; a bind failure is recorded
+// as "Cannot bind term". An empty `name` means the partition name is generated.
+UpdatePartitionSpec& UpdatePartitionSpec::AddField(
+    std::optional name, std::shared_ptr> term) {
+  // Bind the term to get the source field and transform
+  auto bound_result = term->Bind(*schema_, case_sensitive_);
+  if (!bound_result.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot bind term: {}", term->ToString()));
+    return *this;
+  }
+
+  auto bound_transform = bound_result.value();
+  int32_t source_id = bound_transform->reference()->field().field_id();
+  auto transform = bound_transform->transform();
+
+  return AddFieldInternal(name ? &name.value() : nullptr, source_id, transform);
+}
+
+// Shared implementation behind every AddField() overload.
+//
+// `name` may be null, in which case a name is generated from the source column and
+// transform. In order, the function:
+//   1. rejects an explicit name already used by a field added in this update;
+//   2. if the (source, transform) pair exists in the base spec, either undoes a
+//      pending delete of that field or rejects the add as a duplicate;
+//   3. rejects a (source, transform) pair already added in this update;
+//   4. recycles or creates the field, assigns its final name and registers it;
+//   5. resolves a name clash with a base-spec field by scheduling a rename of the old
+//      field to "<name>_<field id>", or rejects the add if the old field is live.
+// Errors are recorded with AddError(); the call always returns *this.
+UpdatePartitionSpec& UpdatePartitionSpec::AddFieldInternal(
+    const std::string* name, int32_t source_id, std::shared_ptr transform) {
+  // Check for duplicate name in added fields
+  if (name != nullptr) {
+    auto it = name_to_added_field_.find(*name);
+    if (it != name_to_added_field_.end()) {
+      AddError(ErrorKind::kInvalidArgument,
+               std::format("Cannot add duplicate partition field: {}", *name));
+      return *this;
+    }
+  }
+
+  // Fields are identified by source column ID plus the transform's string form.
+  TransformKey validation_key{source_id, transform->ToString()};
+
+  // Check if this field already exists in the current spec
+  auto existing_it = transform_to_field_.find(validation_key);
+  if (existing_it != transform_to_field_.end()) {
+    const auto& existing = existing_it->second;
+    if (deletes_.contains(existing.field_id()) && *existing.transform() == *transform) {
+      // If the field was deleted and we're re-adding the same one, just undo the delete
+      return RewriteDeleteAndAddField(existing, name);
+    }
+
+    // The field is still live in the base spec, so adding it again is a duplicate.
+    if (deletes_.find(existing.field_id()) == deletes_.end()) {
+      AddError(
+          ErrorKind::kInvalidArgument,
+          std::format(
+              "Cannot add duplicate partition field for source {} with transform {}, "
+              "conflicts with {}",
+              source_id, transform->ToString(), existing.ToString()));
+      return *this;
+    }
+  }
+
+  // Check if already being added
+  auto added_it = transform_to_added_field_.find(validation_key);
+  if (added_it != transform_to_added_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format(
+                 "Cannot add duplicate partition field for source {} with transform {}, "
+                 "already added: {}",
+                 source_id, transform->ToString(), added_it->second.ToString()));
+    return *this;
+  }
+
+  // Create or recycle the partition field
+  PartitionField new_field = RecycleOrCreatePartitionField(source_id, transform, name);
+
+  // Generate name if not provided
+  std::string field_name;
+  if (name != nullptr) {
+    field_name = *name;
+  } else {
+    field_name = GeneratePartitionName(source_id, transform);
+  }
+
+  // Create the final field with the name
+  new_field = PartitionField(new_field.source_id(), new_field.field_id(), field_name,
+                             new_field.transform());
+
+  // Check for redundant time-based partitions
+  CheckForRedundantAddedPartitions(new_field);
+
+  // NOTE(review): the key is registered before the name-conflict check below, so it
+  // stays registered even when that check records an error and returns early.
+  transform_to_added_field_.emplace(validation_key, new_field);
+
+  // Handle name conflicts with existing fields
+  auto existing_name_it = name_to_field_.find(field_name);
+  if (existing_name_it != name_to_field_.end()) {
+    const auto& existing_field = existing_name_it->second;
+    if (!deletes_.contains(existing_field.field_id())) {
+      if (IsVoidTransform(existing_field)) {
+        // Rename the old deleted field
+        std::string renamed =
+            std::format("{}_{}", existing_field.name(), existing_field.field_id());
+        renames_[std::string(existing_field.name())] = renamed;
+      } else {
+        AddError(
+            ErrorKind::kInvalidArgument,
+            std::format("Cannot add duplicate partition field name: {}", field_name));
+        return *this;
+      }
+    } else {
+      // Field is being deleted, rename it to avoid conflict
+      std::string renamed =
+          std::format("{}_{}", existing_field.name(), existing_field.field_id());
+      renames_[std::string(existing_field.name())] = renamed;
+    }
+  }
+
+  name_to_added_field_.emplace(field_name, new_field);
+  adds_.push_back(new_field);
+
+  return *this;
+}
+
+// Turns "delete field X, then add the identical field" into "keep X": the pending
+// delete is cancelled, and if a different name was requested the existing field is
+// renamed through RenameField() instead of being re-added.
+UpdatePartitionSpec& UpdatePartitionSpec::RewriteDeleteAndAddField(
+    const PartitionField& existing, const std::string* name) {
+  deletes_.erase(existing.field_id());
+  if (name == nullptr || std::string(existing.name()) == *name) {
+    return *this;
+  }
+  return RenameField(std::string(existing.name()), *name);
+}
+
+// Marks the base-spec partition field called `name` for deletion. Rejected (error
+// recorded, nothing marked) when the name belongs to a field added in this update,
+// when the field already has a pending rename, or when no such field exists.
+UpdatePartitionSpec& UpdatePartitionSpec::RemoveField(const std::string& name) {
+  // Cannot delete newly added fields
+  auto added_it = name_to_added_field_.find(name);
+  if (added_it != name_to_added_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot delete newly added field: {}", name));
+    return *this;
+  }
+
+  // Cannot rename and delete
+  if (renames_.find(name) != renames_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot rename and delete partition field: {}", name));
+    return *this;
+  }
+
+  auto field_it = name_to_field_.find(name);
+  if (field_it == name_to_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot find partition field to remove: {}", name));
+    return *this;
+  }
+
+  deletes_.insert(field_it->second.field_id());
+  return *this;
+}
+
+UpdatePartitionSpec& UpdatePartitionSpec::RemoveField(
+    std::shared_ptr> term) {
+  // Bind the term to get the source field
+  auto bound_result = term->Bind(*schema_, case_sensitive_);
+  if (!bound_result.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot bind term: {}", term->ToString()));
+    return *this;
+  }
+
+  auto bound_ref = bound_result.value();
+  int32_t source_id = bound_ref->field().field_id();
+
+  // Reference terms use identity transform
+  TransformKey key{source_id,
Transform::Identity()->ToString()};
+  return RemoveFieldByTransform(key, term->ToString());
+}
+
+// Marks for deletion the base-spec field that matches a transform term. The term is
+// bound against the schema to obtain the source column ID and transform, and the
+// lookup key is (source id, transform string).
+UpdatePartitionSpec& UpdatePartitionSpec::RemoveField(
+    std::shared_ptr> term) {
+  // Bind the term to get the source field and transform
+  auto bound_result = term->Bind(*schema_, case_sensitive_);
+  if (!bound_result.has_value()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot bind term: {}", term->ToString()));
+    return *this;
+  }
+
+  auto bound_transform = bound_result.value();
+  int32_t source_id = bound_transform->reference()->field().field_id();
+  auto transform = bound_transform->transform();
+
+  TransformKey key{source_id, transform->ToString()};
+  return RemoveFieldByTransform(key, term->ToString());
+}
+
+// Shared tail of the term-based RemoveField() overloads. `term_str` is used only in
+// error messages. Rejected when the key belongs to a field added in this update, when
+// the base spec has no field for the key, or when that field has a pending rename.
+UpdatePartitionSpec& UpdatePartitionSpec::RemoveFieldByTransform(
+    const TransformKey& key, const std::string& term_str) {
+  // Cannot delete newly added fields
+  auto added_it = transform_to_added_field_.find(key);
+  if (added_it != transform_to_added_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot delete newly added field: {}", term_str));
+    return *this;
+  }
+
+  auto field_it = transform_to_field_.find(key);
+  if (field_it == transform_to_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot find partition field to remove: {}", term_str));
+    return *this;
+  }
+
+  const auto& field = field_it->second;
+  // Cannot rename and delete
+  if (renames_.find(std::string(field.name())) != renames_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot rename and delete partition field: {}", field.name()));
+    return *this;
+  }
+
+  deletes_.insert(field.field_id());
+  return *this;
+}
+
+// Schedules a rename of the base-spec field `name` to `new_name`. Rejected when
+// `name` was added in this update, does not exist in the base spec, or is already
+// marked for deletion.
+UpdatePartitionSpec& UpdatePartitionSpec::RenameField(const std::string& name,
+                                                      const std::string& new_name) {
+  // Handle existing void field with the new name
+  // The void field currently holding `new_name` is moved aside to
+  // "<name>_<field id>". This happens before the validation below, so the entry is
+  // recorded even if the rename itself is then rejected.
+  auto existing_it = name_to_field_.find(new_name);
+  if (existing_it != name_to_field_.end() && IsVoidTransform(existing_it->second)) {
+    std::string renamed =
+        std::format("{}_{}", existing_it->second.name(), existing_it->second.field_id());
+    renames_[new_name] = renamed;
+  }
+
+  // Cannot rename newly added fields
+  auto added_it = name_to_added_field_.find(name);
+  if (added_it != name_to_added_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot rename newly added partition field: {}", name));
+    return *this;
+  }
+
+  auto field_it = name_to_field_.find(name);
+  if (field_it == name_to_field_.end()) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot find partition field to rename: {}", name));
+    return *this;
+  }
+
+  // Cannot delete and rename
+  if (deletes_.contains(field_it->second.field_id())) {
+    AddError(ErrorKind::kInvalidArgument,
+             std::format("Cannot delete and rename partition field: {}", name));
+    return *this;
+  }
+
+  renames_[name] = new_name;
+  return *this;
+}
+
+// Builds the evolved partition spec from the recorded adds, deletes and renames and
+// stores it in applied_spec_.
+//
+// Returns the outcome of CheckErrors() when it reports a failure, and otherwise any
+// error from PartitionSpec::Make(). Base-spec fields keep their order; added fields
+// are appended after them in the order they were added.
+Status UpdatePartitionSpec::Apply() {
+  ICEBERG_RETURN_UNEXPECTED(CheckErrors());
+
+  std::vector new_fields;
+
+  // Process existing fields
+  for (const auto& field : spec_->fields()) {
+    if (!deletes_.contains(field.field_id())) {
+      // Field is kept, check for rename
+      auto rename_it = renames_.find(std::string(field.name()));
+      if (rename_it != renames_.end()) {
+        new_fields.emplace_back(field.source_id(), field.field_id(), rename_it->second,
+                                field.transform());
+      } else {
+        new_fields.push_back(field);
+      }
+    } else if (format_version_ < 2) {
+      // In V1, deleted fields are replaced with void transform
+      // A rename scheduled for the deleted field still applies to its placeholder.
+      auto rename_it = renames_.find(std::string(field.name()));
+      std::string field_name =
+          rename_it != renames_.end() ? rename_it->second : std::string(field.name());
+      new_fields.emplace_back(field.source_id(), field.field_id(), field_name,
+                              Transform::Void());
+    }
+    // In V2, deleted fields are simply removed
+  }
+
+  // Add new fields
+  for (const auto& new_field : adds_) {
+    new_fields.push_back(new_field);
+  }
+
+  // Determine the new spec ID
+  // NOTE(review): spec_ is already dereferenced by the loop above, so the null branch
+  // of this conditional looks unreachable - confirm.
+  int32_t new_spec_id = spec_ ? spec_->spec_id() + 1 : PartitionSpec::kInitialSpecId;
+
+  // In V2, if all fields are removed, reset last_assigned_partition_id to allow
+  // field IDs to restart from 1000 when fields are added again
+  int32_t last_assigned_id = last_assigned_partition_id_;
+  if (format_version_ >= 2 && new_fields.empty()) {
+    last_assigned_id = PartitionSpec::kLegacyPartitionDataIdStart - 1;
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(
+      auto spec_result, PartitionSpec::Make(*schema_, new_spec_id, std::move(new_fields),
+                                            last_assigned_id));
+  applied_spec_ = std::shared_ptr(spec_result.release());
+  return {};
+}
+
+// Returns the spec stored by the last successful Apply(), or an InvalidArgument error
+// when no spec has been stored yet.
+Result> UpdatePartitionSpec::GetAppliedSpec() const {
+  if (!applied_spec_) {
+    return InvalidArgument("Apply() must be called successfully before getting the spec");
+  }
+  return applied_spec_;
+}
+
+// Applies the pending changes and sends them to the catalog: one update that adds the
+// new spec and, unless AddNonDefaultSpec() was called, one that makes it the default.
+// The table requirements are derived from the base metadata and the update list.
+Status UpdatePartitionSpec::Commit() {
+  // Apply the changes first
+  ICEBERG_RETURN_UNEXPECTED(Apply());
+
+  ICEBERG_ASSIGN_OR_RAISE(auto spec_result, GetAppliedSpec());
+  std::shared_ptr new_spec = spec_result;
+
+  std::vector> updates;
+
+  // Add the new partition spec
+  updates.emplace_back(std::make_unique(new_spec));
+
+  // If set_as_default_ is true, set this spec as the default
+  if (set_as_default_) {
+    updates.emplace_back(
+        std::make_unique(new_spec->spec_id()));
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto requirements,
+                          TableRequirements::ForUpdateTable(*base_metadata_, updates));
+  ICEBERG_RETURN_UNEXPECTED(catalog_->UpdateTable(identifier_, requirements, updates));
+
+  return {};
+}
+
+// Hands out the next partition field ID by pre-incrementing the running counter.
+int32_t UpdatePartitionSpec::AssignFieldId() { return ++last_assigned_partition_id_; }
+
+PartitionField
UpdatePartitionSpec::RecycleOrCreatePartitionField(
+    int32_t source_id, std::shared_ptr transform, const std::string* name) {
+  // Returns a field from a historical spec when one can be reused, otherwise a new
+  // field with a freshly assigned ID. `name` may be null, meaning no particular name
+  // was requested; a new field then gets an empty name for the caller to fill in.
+
+  // In V2+, search historical specs for a matching field to recycle
+  if (format_version_ >= 2) {
+    // Walk the historical specs in place rather than first copying every field into a
+    // temporary vector. The visiting order is unchanged, so the same first match wins.
+    for (const auto& partition_spec : base_metadata_->partition_specs) {
+      for (const auto& field : partition_spec->fields()) {
+        if (field.source_id() == source_id && *field.transform() == *transform) {
+          // If target name is specified then consider it too, otherwise not
+          if (name == nullptr || std::string(field.name()) == *name) {
+            return field;
+          }
+        }
+      }
+    }
+  }
+  // No matching field found, create a new one
+  std::string field_name = name ? *name : "";
+  return {source_id, AssignFieldId(), field_name, transform};
+}
+
+// Derives the default partition field name for `transform` applied to the schema
+// column with ID `source_id`, e.g. "<col>", "<col>_bucket_16", "<col>_trunc_4",
+// "<col>_day". When the column cannot be found, "unknown" is used as the column name.
+std::string UpdatePartitionSpec::GeneratePartitionName(
+    int32_t source_id, const std::shared_ptr& transform) const {
+  // Find the source field name
+  auto field_result = schema_->FindFieldById(source_id);
+  std::string source_name = "unknown";
+  if (field_result.has_value() && field_result.value().has_value()) {
+    source_name = std::string(field_result.value().value().get().name());
+  }
+
+  // Builds "<source>_<label>_<N>" for the parameterized transforms. N is the text
+  // between '[' and the following ']' in Transform::ToString(), which renders as
+  // "bucket[16]" or "truncate[4]". Without a well-formed bracket pair the name falls
+  // back to "<source>_<label>". Searching for ']' only after '[' keeps the substring
+  // length from underflowing. The string form is rendered only for these transforms.
+  const auto parameterized_name = [&](const char* label) {
+    const std::string transform_str = transform->ToString();
+    const size_t open_bracket = transform_str.find('[');
+    if (open_bracket != std::string::npos) {
+      const size_t close_bracket = transform_str.find(']', open_bracket + 1);
+      if (close_bracket != std::string::npos) {
+        const std::string param_str =
+            transform_str.substr(open_bracket + 1, close_bracket - open_bracket - 1);
+        return std::format("{}_{}_{}", source_name, label, param_str);
+      }
+    }
+    return std::format("{}_{}", source_name, label);
+  };
+
+  switch (transform->transform_type()) {
+    case TransformType::kIdentity:
+      return source_name;
+    case TransformType::kBucket:
+      // Format: sourceName_bucket_N (matching Java: sourceName + "_bucket_" + numBuckets)
+      return parameterized_name("bucket");
+    case TransformType::kTruncate:
+      // Format: sourceName_trunc_N (matching Java: sourceName + "_trunc_" + width)
+      return parameterized_name("trunc");
+    case TransformType::kYear:
+      return std::format("{}_year", source_name);
+    case TransformType::kMonth:
+      return std::format("{}_month", source_name);
+    case TransformType::kDay:
+      return std::format("{}_day", source_name);
+    case TransformType::kHour:
+      return std::format("{}_hour", source_name);
+    case TransformType::kVoid:
+      return std::format("{}_null", source_name);
+    case TransformType::kUnknown:
+      return std::format("{}_unknown", source_name);
+  }
+  std::unreachable();
+}
+
+// True for the year, month, day and hour transforms; false for every other type.
+bool UpdatePartitionSpec::IsTimeTransform(const std::shared_ptr& transform) {
+  switch (transform->transform_type()) {
+    case TransformType::kYear:
+    case TransformType::kMonth:
+    case TransformType::kDay:
+    case TransformType::kHour:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True when the field's transform type is kVoid.
+bool UpdatePartitionSpec::IsVoidTransform(const PartitionField& field) {
+  return field.transform()->transform_type() == TransformType::kVoid;
+}
+
+// Allows at most one time-based partition per source column among the fields added in
+// this update. The first time-based field for a source is remembered; a second one is
+// recorded as an error. Does nothing once an error has already been recorded.
+void UpdatePartitionSpec::CheckForRedundantAddedPartitions(const PartitionField& field) {
+  if (HasErrors()) return;
+
+  if (IsTimeTransform(field.transform())) {
+    auto it = added_time_fields_.find(field.source_id());
+    if (it != added_time_fields_.end()) {
+      AddError(ErrorKind::kInvalidArgument,
+               std::format("Cannot add redundant partition field: {} conflicts with {}",
+                           field.ToString(), it->second.ToString()));
+      return;
+    }
+    added_time_fields_.emplace(field.source_id(), field);
+  }
+}
+
+// Maps each field name in `spec` to a copy of its field. emplace() keeps the first
+// entry if a name occurs more than once.
+std::unordered_map UpdatePartitionSpec::IndexSpecByName(
+    const PartitionSpec& spec) {
+  std::unordered_map index;
+  for (const auto& field : spec.fields()) {
+    index.emplace(std::string(field.name()), field);
+  }
+  return index;
+}
+
+// Maps each (source id, transform string) pair in `spec` to a copy of its field.
+// emplace() keeps the first entry if a pair occurs more than once.
+std::unordered_map
+UpdatePartitionSpec::IndexSpecByTransform(const PartitionSpec& spec) {
+  std::unordered_map index;
+  for (const auto& field : spec.fields()) {
+    TransformKey key{field.source_id(), field.transform()->ToString()};
+    index.emplace(key, field);
+  }
+  return index;
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/update/update_partition_spec.h b/src/iceberg/update/update_partition_spec.h
new file mode 100644
index 000000000..e13b2e2f7
--- /dev/null
+++ b/src/iceberg/update/update_partition_spec.h
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/update/update_partition_spec.h
+/// API for partition spec evolution.
+ +#include +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/pending_update.h" +#include "iceberg/result.h" +#include "iceberg/table_identifier.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief API for partition spec evolution. +/// +/// When committing, these changes will be applied to the current table metadata. +/// Commit conflicts will not be resolved and will result in a CommitFailed error. +/// +/// NOTE(review): in this copy the template argument lists and the angle-bracket +/// include targets appear to have been stripped (e.g. a bare `std::shared_ptr`, +/// `std::pair;` and `#include` with no header). Restore them from the original +/// header before compiling. +class ICEBERG_EXPORT UpdatePartitionSpec : public PendingUpdate { + public: + /// \brief Construct an UpdatePartitionSpec for the specified table. + /// + /// \param identifier The table identifier. + /// \param catalog The catalog. + /// \param base The base table metadata. + UpdatePartitionSpec(TableIdentifier identifier, std::shared_ptr catalog, + std::shared_ptr base); + + ~UpdatePartitionSpec() override; + + /// \brief Set whether column resolution in the source schema should be case sensitive. + /// + /// The case_sensitive_ member carries an in-class initializer of true. + /// + /// \param is_case_sensitive True to resolve source column names case sensitively. + /// \return Reference for method chaining, following the other setters + /// (NOTE(review): assumed to be this object - confirm in the implementation). + UpdatePartitionSpec& CaseSensitive(bool is_case_sensitive); + + /// \brief Add a new partition field from a source column. + /// + /// The partition field will be created as an identity partition field for the given + /// source column, with the same name as the source column. + /// + /// \param source_name Source column name in the table schema. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddField(const std::string& source_name); + + /// \brief Add a new partition field from an unbound term. + /// + /// The partition field will use the term's transform or the identity transform if + /// the term is a reference. + /// + /// \param term The unbound term representing the partition transform. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddField(std::shared_ptr> term); + + /// \brief Add a new partition field from an unbound transform term. + /// + /// \param term The unbound transform term.
+ /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddField(std::shared_ptr> term); + + /// \brief Add a new partition field with a custom name. + /// + /// \param name Name for the partition field. The parameter is a std::optional; + /// NOTE(review): presumably an unset name falls back to a generated one (see + /// GeneratePartitionName) - confirm in the implementation. + /// \param term The unbound term representing the partition transform. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddField(std::optional name, + std::shared_ptr> term); + + /// \brief Add a new partition field with a custom name from an unbound transform. + /// + /// \param name Name for the partition field. The parameter is a std::optional; + /// NOTE(review): presumably an unset name falls back to a generated one (see + /// GeneratePartitionName) - confirm in the implementation. + /// \param term The unbound transform term. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddField(std::optional name, + std::shared_ptr> term); + + /// \brief Remove a partition field by name. + /// + /// \param name Name of the partition field to remove. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& RemoveField(const std::string& name); + + /// \brief Remove a partition field by its transform term. + /// + /// The partition field with the same transform and source reference will be removed. + /// If the term is a reference and does not have a transform, the identity transform + /// is used. + /// + /// \param term The unbound term representing the partition transform to remove. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& RemoveField(std::shared_ptr> term); + + /// \brief Remove a partition field by its transform term. + /// + /// The partition field with the same transform and source reference will be removed. + /// + /// \param term The unbound transform term. + /// \return Reference to this for method chaining. + UpdatePartitionSpec& RemoveField(std::shared_ptr> term); + + /// \brief Rename a field in the partition spec. + /// + /// \param name Name of the partition field to rename. + /// \param new_name Replacement name for the partition field. + /// \return Reference to this for method chaining.
+ UpdatePartitionSpec& RenameField(const std::string& name, const std::string& new_name); + + /// \brief Request that the new partition spec is NOT set as the default spec. + /// + /// The default behavior is to set the new spec as the default partition spec + /// (the set_as_default_ member carries an in-class initializer of true). + /// + /// \return Reference to this for method chaining. + UpdatePartitionSpec& AddNonDefaultSpec(); + + /// \brief Apply the pending changes and validate them. + /// + /// The resulting partition spec can be retrieved using GetAppliedSpec() after + /// a successful Apply(). + /// + /// \return Status::OK if the changes are valid, or an error. + Status Apply() override; + + /// \brief Get the applied partition spec after a successful Apply(). + /// + /// \return The applied partition spec, or an error if Apply() hasn't been called + /// successfully. + Result> GetAppliedSpec() const; + + /// \brief Apply and commit the pending changes to the table. + /// + /// \return Status::OK if the commit was successful, or an error. + Status Commit() override; + + private: + /// \brief Pair of source ID and transform string for indexing. + using TransformKey = std::pair; + + /// \brief Hash function for TransformKey. + /// + /// XORs the hash of the first element with the hash of the second element + /// shifted left by one bit. + /// NOTE(review): XOR/shift is a weak mixer; consider a hash_combine-style mix if + /// collisions ever matter for these indexes. + struct TransformKeyHash { + size_t operator()(const TransformKey& key) const { + return std::hash{}(key.first) ^ + (std::hash{}(key.second) << 1); + } + }; + + /// \brief Assign a new partition field ID. + int32_t AssignFieldId(); + + /// \brief Recycle or create a partition field. + /// + /// In V2, searches for a similar partition field in historical specs. + /// If not found or in V1, creates a new PartitionField. + /// + /// \param source_id ID of the source column. + /// \param transform Transform applied to the source column. + /// \param name Pointer to the partition field name. NOTE(review): presumably may + /// be null when no explicit name was given - confirm in the implementation. + PartitionField RecycleOrCreatePartitionField(int32_t source_id, + std::shared_ptr transform, + const std::string* name); + + /// \brief Internal implementation of AddField with resolved source ID and transform.
+ UpdatePartitionSpec& AddFieldInternal(const std::string* name, int32_t source_id, + std::shared_ptr transform); + + /// \brief Generate a partition field name from the source and transform. + std::string GeneratePartitionName(int32_t source_id, + const std::shared_ptr& transform) const; + + /// \brief Check if a transform is a time-based transform. + static bool IsTimeTransform(const std::shared_ptr& transform); + + /// \brief Check if a partition field uses the void transform. + static bool IsVoidTransform(const PartitionField& field); + + /// \brief Check for redundant time-based partition fields. + /// + /// Returns void, so the outcome is not reported through the return value. + /// NOTE(review): confirm how a detected redundancy is surfaced to the caller. + void CheckForRedundantAddedPartitions(const PartitionField& field); + + /// \brief Handle rewriting a delete-and-add operation for the same field. + UpdatePartitionSpec& RewriteDeleteAndAddField(const PartitionField& existing, + const std::string* name); + + /// \brief Internal helper to remove a field by transform key. + UpdatePartitionSpec& RemoveFieldByTransform(const TransformKey& key, + const std::string& term_str); + + /// \brief Index the spec fields by name. + static std::unordered_map IndexSpecByName( + const PartitionSpec& spec); + + /// \brief Index the spec fields by (source_id, transform) pair.
+ static std::unordered_map + IndexSpecByTransform(const PartitionSpec& spec); + + TableIdentifier identifier_; + std::shared_ptr catalog_; + std::shared_ptr base_metadata_; + + // Configuration (format_version_ and last_assigned_partition_id_ have no in-class initializer) + int32_t format_version_; + std::shared_ptr spec_; + std::shared_ptr schema_; + bool case_sensitive_{true}; + bool set_as_default_{true}; + int32_t last_assigned_partition_id_; + + // Indexes for existing fields + std::unordered_map name_to_field_; + std::unordered_map transform_to_field_; + + // Pending changes + std::vector adds_; + std::unordered_map added_time_fields_; + std::unordered_map + transform_to_added_field_; + std::unordered_map name_to_added_field_; + std::unordered_set deletes_; + std::unordered_map renames_; + + // Applied result (NOTE(review): presumably set by Apply() and returned by GetAppliedSpec() - confirm) + std::shared_ptr applied_spec_; +}; + +} // namespace iceberg diff --git a/src/iceberg/update/update_properties.cc b/src/iceberg/update/update_properties.cc index a4dcd1548..3421b24f2 100644 --- a/src/iceberg/update/update_properties.cc +++ b/src/iceberg/update/update_properties.cc @@ -42,6 +42,8 @@ UpdateProperties::UpdateProperties(TableIdentifier identifier, catalog_(std::move(catalog)), base_metadata_(std::move(base)) {} +UpdateProperties::~UpdateProperties() = default; + UpdateProperties& UpdateProperties::Set(const std::string& key, const std::string& value) { if (removals_.contains(key)) { diff --git a/src/iceberg/update/update_properties.h b/src/iceberg/update/update_properties.h index 0f1adf76a..5f5c2ef30 100644 --- a/src/iceberg/update/update_properties.h +++ b/src/iceberg/update/update_properties.h @@ -43,6 +43,8 @@ class ICEBERG_EXPORT UpdateProperties : public PendingUpdate { UpdateProperties(TableIdentifier identifier, std::shared_ptr catalog, std::shared_ptr base); + ~UpdateProperties() override; + /// \brief Sets a property key to a specified value. /// /// The key can not be marked for previous removal and reserved property keys will be