Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ set(ICEBERG_SOURCES
transform.cc
transform_function.cc
type.cc
update/expire_snapshots.cc
update/pending_update.cc
update/update_partition_spec.cc
update/update_properties.cc
Expand Down
1 change: 1 addition & 0 deletions src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ iceberg_sources = files(
'transform.cc',
'transform_function.cc',
'type.cc',
'update/expire_snapshots.cc',
'update/pending_update.cc',
'update/update_partition_spec.cc',
'update/update_properties.cc',
Expand Down
8 changes: 8 additions & 0 deletions src/iceberg/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "iceberg/table_properties.h"
#include "iceberg/table_scan.h"
#include "iceberg/transaction.h"
#include "iceberg/update/expire_snapshots.h"
#include "iceberg/update/update_partition_spec.h"
#include "iceberg/update/update_properties.h"
#include "iceberg/update/update_schema.h"
Expand Down Expand Up @@ -179,6 +180,13 @@ Result<std::shared_ptr<UpdateSchema>> Table::NewUpdateSchema() {
return transaction->NewUpdateSchema();
}

/// \brief Create an ExpireSnapshots action backed by a fresh update transaction.
///
/// The transaction is created with Transaction::Kind::kUpdate and auto-commit
/// enabled. ICEBERG_ASSIGN_OR_RAISE is expected to return early with the error if
/// Transaction::Make does not succeed.
Result<std::shared_ptr<ExpireSnapshots>> Table::NewExpireSnapshots() {
  ICEBERG_ASSIGN_OR_RAISE(auto txn,
                          Transaction::Make(shared_from_this(),
                                            Transaction::Kind::kUpdate,
                                            /*auto_commit=*/true));
  return txn->NewExpireSnapshots();
}

Result<std::shared_ptr<StagedTable>> StagedTable::Make(
TableIdentifier identifier, std::shared_ptr<TableMetadata> metadata,
std::string metadata_location, std::shared_ptr<FileIO> io,
Expand Down
4 changes: 4 additions & 0 deletions src/iceberg/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ class ICEBERG_EXPORT Table : public std::enable_shared_from_this<Table> {
/// changes.
virtual Result<std::shared_ptr<UpdateSchema>> NewUpdateSchema();

/// \brief Create a new ExpireSnapshots to remove expired snapshots and commit the
/// changes.
virtual Result<std::shared_ptr<ExpireSnapshots>> NewExpireSnapshots();

protected:
Table(TableIdentifier identifier, std::shared_ptr<TableMetadata> metadata,
std::string metadata_location, std::shared_ptr<FileIO> io,
Expand Down
224 changes: 219 additions & 5 deletions src/iceberg/table_metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,12 @@ class TableMetadataBuilder::Impl {
Result<int32_t> AddSchema(const Schema& schema, int32_t new_last_column_id);
void SetLocation(std::string_view location);

Status SetRef(const std::string& name, std::shared_ptr<SnapshotRef> ref);
Status RemoveRef(const std::string& name);
Status AddSnapshot(std::shared_ptr<Snapshot> snapshot);
Status RemoveSnapshots(const std::vector<int64_t>& snapshot_ids);
Status RemovePartitionSpecs(const std::vector<int32_t>& spec_ids);

Result<std::unique_ptr<TableMetadata>> Build();

private:
Expand Down Expand Up @@ -1077,6 +1083,209 @@ int32_t TableMetadataBuilder::Impl::ReuseOrCreateNewSchemaId(
return new_schema_id;
}

Status TableMetadataBuilder::Impl::SetRef(const std::string& name,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#408 is adding SetRef and AddSnapshot. Perhaps we should wait for it to be merged and leave them as not implemented for now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. This PR depends on the implementation of SetRef and AddSnapshot; I will rebase after #408 is merged to avoid conflicts.

std::shared_ptr<SnapshotRef> ref) {
// Check if the ref already exists and is equal to the new ref
auto existing_ref_it = metadata_.refs.find(name);
if (existing_ref_it != metadata_.refs.end() && *existing_ref_it->second == *ref) {
// No change needed
return {};
}

// Validate that the snapshot exists
int64_t snapshot_id = ref->snapshot_id;
auto snapshot_it =
std::ranges::find_if(metadata_.snapshots, [snapshot_id](const auto& snapshot) {
return snapshot != nullptr && snapshot->snapshot_id == snapshot_id;
});
ICEBERG_PRECHECK(snapshot_it != metadata_.snapshots.end(),
"Cannot set {} to unknown snapshot: {}", name, snapshot_id);

// Check if this is an added snapshot (in the current set of changes)
bool is_added_snapshot =
std::ranges::any_of(changes_, [snapshot_id](const auto& change) {
return change->kind() == TableUpdate::Kind::kAddSnapshot &&
internal::checked_cast<const table::AddSnapshot&>(*change)
.snapshot()
->snapshot_id == snapshot_id;
});

if (is_added_snapshot) {
metadata_.last_updated_ms = (*snapshot_it)->timestamp_ms;
}

// Handle main branch specially
if (name == SnapshotRef::kMainBranch) {
metadata_.current_snapshot_id = ref->snapshot_id;
if (metadata_.last_updated_ms == kInvalidLastUpdatedMs) {
metadata_.last_updated_ms =
TimePointMs{std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())};
}

metadata_.snapshot_log.emplace_back(metadata_.last_updated_ms, ref->snapshot_id);
}

// Update the refs map
metadata_.refs[name] = ref;

// Record the change
if (ref->type() == SnapshotRefType::kBranch) {
auto retention = std::get<SnapshotRef::Branch>(ref->retention);
changes_.push_back(std::make_unique<table::SetSnapshotRef>(
name, ref->snapshot_id, ref->type(), retention.min_snapshots_to_keep,
retention.max_snapshot_age_ms, retention.max_ref_age_ms));
} else {
auto retention = std::get<SnapshotRef::Tag>(ref->retention);
changes_.push_back(std::make_unique<table::SetSnapshotRef>(
name, ref->snapshot_id, ref->type(), std::nullopt, std::nullopt,
retention.max_ref_age_ms));
}

return {};
}

Status TableMetadataBuilder::Impl::RemoveRef(const std::string& name) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not align with the Java impl:

    public Builder removeRef(String name) {
      if (SnapshotRef.MAIN_BRANCH.equals(name)) {
        this.currentSnapshotId = -1;
      }

      SnapshotRef ref = refs.remove(name);
      if (ref != null) {
        changes.add(new MetadataUpdate.RemoveSnapshotRef(name));
      }

      return this;
    }

// Handle main branch specially
if (name == SnapshotRef::kMainBranch) {
metadata_.current_snapshot_id = kInvalidSnapshotId;
}

// Remove the ref from the map
auto it = metadata_.refs.find(name);
if (it != metadata_.refs.end()) {
metadata_.refs.erase(it);
changes_.push_back(std::make_unique<table::RemoveSnapshotRef>(name));
}

return {};
}

/// \brief Append a snapshot to the metadata being built and record an AddSnapshot
/// change.
///
/// A null snapshot is ignored. Otherwise the builder must already hold at least one
/// schema, partition spec and sort order, the snapshot ID must not be in use, and
/// (outside format version 1) a snapshot that has a parent must carry a sequence
/// number greater than the last one recorded.
///
/// \param snapshot The snapshot to add; may be null.
/// \return An empty Status on success or no-op; otherwise whatever ICEBERG_PRECHECK
/// yields for the failed precondition.
Status TableMetadataBuilder::Impl::AddSnapshot(std::shared_ptr<Snapshot> snapshot) {
  // Nothing to add.
  if (!snapshot) {
    return {};
  }

  // A snapshot is only meaningful once the table has a schema, spec and sort order.
  ICEBERG_PRECHECK(!metadata_.schemas.empty(),
                   "Attempting to add a snapshot before a schema is added");
  ICEBERG_PRECHECK(!metadata_.partition_specs.empty(),
                   "Attempting to add a snapshot before a partition spec is added");
  ICEBERG_PRECHECK(!metadata_.sort_orders.empty(),
                   "Attempting to add a snapshot before a sort order is added");

  // Reject duplicate snapshot IDs (null entries in the list are skipped).
  const int64_t new_id = snapshot->snapshot_id;
  const bool id_in_use =
      std::ranges::any_of(metadata_.snapshots, [new_id](const auto& existing) {
        return existing != nullptr && existing->snapshot_id == new_id;
      });
  ICEBERG_PRECHECK(!id_in_use, "Snapshot already exists for id: {}", new_id);

  // Format v1 and parentless snapshots are exempt from the monotonic sequence
  // number requirement.
  ICEBERG_PRECHECK(
      metadata_.format_version == 1 ||
          snapshot->sequence_number > metadata_.last_sequence_number ||
          !snapshot->parent_snapshot_id.has_value(),
      "Cannot add snapshot with sequence number {} older than last sequence number {}",
      snapshot->sequence_number, metadata_.last_sequence_number);

  // Fold the snapshot into the metadata, then log the change.
  metadata_.last_updated_ms = snapshot->timestamp_ms;
  metadata_.last_sequence_number = snapshot->sequence_number;
  metadata_.snapshots.push_back(snapshot);
  changes_.push_back(std::make_unique<table::AddSnapshot>(std::move(snapshot)));

  // TODO(xiao.dong) Handle row lineage for format version >= 3
  return {};
}

Status TableMetadataBuilder::Impl::RemoveSnapshots(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not align with the Java impl either.

const std::vector<int64_t>& snapshot_ids) {
if (snapshot_ids.empty()) {
return {};
}

std::unordered_set<int64_t> snapshot_ids_set(snapshot_ids.begin(), snapshot_ids.end());

// Build a map of snapshot IDs for quick lookup
std::unordered_map<int64_t, std::shared_ptr<Snapshot>> snapshots_by_id;
for (const auto& snapshot : metadata_.snapshots) {
if (snapshot) {
snapshots_by_id[snapshot->snapshot_id] = snapshot;
}
}

// Filter snapshots to retain
std::vector<std::shared_ptr<Snapshot>> retained_snapshots;
retained_snapshots.reserve(metadata_.snapshots.size());

for (const auto& snapshot : metadata_.snapshots) {
if (!snapshot) continue;

int64_t snapshot_id = snapshot->snapshot_id;
if (snapshot_ids_set.contains(snapshot_id)) {
// Remove from the map
snapshots_by_id.erase(snapshot_id);
// Record the removal
changes_.push_back(
std::make_unique<table::RemoveSnapshots>(std::vector<int64_t>{snapshot_id}));
// Note: Statistics and partition statistics removal would be handled here
// if those features were implemented
} else {
retained_snapshots.push_back(snapshot);
}
}

metadata_.snapshots = std::move(retained_snapshots);

// Remove any refs that are no longer valid (dangling refs)
std::vector<std::string> dangling_refs;
for (const auto& [ref_name, ref] : metadata_.refs) {
if (!snapshots_by_id.contains(ref->snapshot_id)) {
dangling_refs.push_back(ref_name);
}
}

for (const auto& ref_name : dangling_refs) {
ICEBERG_RETURN_UNEXPECTED(RemoveRef(ref_name));
}

return {};
}

/// \brief Drop the partition specs with the given IDs from the metadata being built.
///
/// An empty ID list is a no-op. Asking to remove the current default spec fails the
/// ICEBERG_PRECHECK. On success the matching specs are removed from both the spec
/// list and the specs_by_id_ index, and a single RemovePartitionSpecs change
/// carrying the IDs exactly as passed in is recorded.
///
/// \param spec_ids IDs of the partition specs to remove.
/// \return An empty Status on success or no-op.
Status TableMetadataBuilder::Impl::RemovePartitionSpecs(
    const std::vector<int32_t>& spec_ids) {
  if (spec_ids.empty()) {
    return {};
  }

  const std::unordered_set<int32_t> doomed(spec_ids.cbegin(), spec_ids.cend());

  // The default spec must stay in place.
  ICEBERG_PRECHECK(!doomed.contains(metadata_.default_spec_id),
                   "Cannot remove the default partition spec");

  // Erase the matching specs in place; the surviving specs keep their order.
  std::erase_if(metadata_.partition_specs,
                [&doomed](const auto& spec) { return doomed.contains(spec->spec_id()); });

  // Keep the ID lookup index in step with the list.
  for (const int32_t id : spec_ids) {
    specs_by_id_.erase(id);
  }

  changes_.push_back(std::make_unique<table::RemovePartitionSpecs>(spec_ids));
  return {};
}

TableMetadataBuilder::TableMetadataBuilder(int8_t format_version)
: impl_(std::make_unique<Impl>(format_version)) {}

Expand Down Expand Up @@ -1179,7 +1388,8 @@ TableMetadataBuilder& TableMetadataBuilder::AddPartitionSpec(

TableMetadataBuilder& TableMetadataBuilder::RemovePartitionSpecs(
const std::vector<int32_t>& spec_ids) {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
ICEBERG_BUILDER_RETURN_IF_ERROR(impl_->RemovePartitionSpecs(spec_ids));
return *this;
}

TableMetadataBuilder& TableMetadataBuilder::RemoveSchemas(
Expand Down Expand Up @@ -1207,7 +1417,8 @@ TableMetadataBuilder& TableMetadataBuilder::AddSortOrder(

TableMetadataBuilder& TableMetadataBuilder::AddSnapshot(
std::shared_ptr<Snapshot> snapshot) {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
ICEBERG_BUILDER_RETURN_IF_ERROR(impl_->AddSnapshot(std::move(snapshot)));
return *this;
}

TableMetadataBuilder& TableMetadataBuilder::SetBranchSnapshot(int64_t snapshot_id,
Expand All @@ -1217,11 +1428,13 @@ TableMetadataBuilder& TableMetadataBuilder::SetBranchSnapshot(int64_t snapshot_i

TableMetadataBuilder& TableMetadataBuilder::SetRef(const std::string& name,
std::shared_ptr<SnapshotRef> ref) {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
ICEBERG_BUILDER_RETURN_IF_ERROR(impl_->SetRef(name, std::move(ref)));
return *this;
}

TableMetadataBuilder& TableMetadataBuilder::RemoveRef(const std::string& name) {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
ICEBERG_BUILDER_RETURN_IF_ERROR(impl_->RemoveRef(name));
return *this;
}

TableMetadataBuilder& TableMetadataBuilder::RemoveSnapshots(
Expand All @@ -1231,7 +1444,8 @@ TableMetadataBuilder& TableMetadataBuilder::RemoveSnapshots(

TableMetadataBuilder& TableMetadataBuilder::RemoveSnapshots(
const std::vector<int64_t>& snapshot_ids) {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
ICEBERG_BUILDER_RETURN_IF_ERROR(impl_->RemoveSnapshots(snapshot_ids));
return *this;
}

TableMetadataBuilder& TableMetadataBuilder::SuppressHistoricalSnapshots() {
Expand Down
8 changes: 5 additions & 3 deletions src/iceberg/table_update.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ std::unique_ptr<TableUpdate> SetDefaultPartitionSpec::Clone() const {
// RemovePartitionSpecs

void RemovePartitionSpecs::ApplyTo(TableMetadataBuilder& builder) const {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
builder.RemovePartitionSpecs(spec_ids_);
}

void RemovePartitionSpecs::GenerateRequirements(TableUpdateContext& context) const {
Expand Down Expand Up @@ -301,7 +301,9 @@ std::unique_ptr<TableUpdate> AddSnapshot::Clone() const {

// RemoveSnapshots

void RemoveSnapshots::ApplyTo(TableMetadataBuilder& builder) const {}
void RemoveSnapshots::ApplyTo(TableMetadataBuilder& builder) const {
builder.RemoveSnapshots(snapshot_ids_);
}

void RemoveSnapshots::GenerateRequirements(TableUpdateContext& context) const {
// RemoveSnapshots doesn't generate any requirements
Expand All @@ -322,7 +324,7 @@ std::unique_ptr<TableUpdate> RemoveSnapshots::Clone() const {
// RemoveSnapshotRef

void RemoveSnapshotRef::ApplyTo(TableMetadataBuilder& builder) const {
throw IcebergError(std::format("{} not implemented", __FUNCTION__));
builder.RemoveRef(ref_name_);
}

void RemoveSnapshotRef::GenerateRequirements(TableUpdateContext& context) const {
Expand Down
1 change: 1 addition & 0 deletions src/iceberg/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ if(ICEBERG_BUILD_BUNDLE)
add_iceberg_test(table_update_test
USE_BUNDLE
SOURCES
expire_snapshots_test.cc
transaction_test.cc
update_partition_spec_test.cc
update_properties_test.cc
Expand Down
Loading
Loading