From 8ed196cd08356d1601d1c37e3e460d60a466c596 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 4 Sep 2025 23:19:24 +0800 Subject: [PATCH 1/2] feat: define table properties with default values Just copied everything from the TableProperties.java as of today --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/table.cc | 18 +- src/iceberg/table.h | 12 +- src/iceberg/table_properties.cc | 47 +++++ src/iceberg/table_properties.h | 303 ++++++++++++++++++++++++++++++++ src/iceberg/type_fwd.h | 1 + 6 files changed, 371 insertions(+), 11 deletions(-) create mode 100644 src/iceberg/table_properties.cc create mode 100644 src/iceberg/table_properties.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index c8fb07721..37cefa3e7 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -41,6 +41,7 @@ set(ICEBERG_SOURCES statistics_file.cc table.cc table_metadata.cc + table_properties.cc table_scan.cc transform.cc transform_function.cc diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc index bff445648..cdcb6a95b 100644 --- a/src/iceberg/table.cc +++ b/src/iceberg/table.cc @@ -26,11 +26,24 @@ #include "iceberg/schema.h" #include "iceberg/sort_order.h" #include "iceberg/table_metadata.h" +#include "iceberg/table_properties.h" #include "iceberg/table_scan.h" #include "iceberg/util/macros.h" namespace iceberg { +Table::~Table() = default; + +Table::Table(TableIdentifier identifier, std::shared_ptr metadata, + std::string metadata_location, std::shared_ptr io, + std::shared_ptr catalog) + : identifier_(std::move(identifier)), + metadata_(std::move(metadata)), + metadata_location_(std::move(metadata_location)), + io_(std::move(io)), + catalog_(std::move(catalog)), + properties_(TableProperties::FromMap(metadata_->properties)) {} + const std::string& Table::uuid() const { return metadata_->table_uuid; } Status Table::Refresh() { @@ -43,6 +56,7 @@ Status Table::Refresh() { metadata_ = std::move(refreshed_table->metadata_); metadata_location_ = 
std::move(refreshed_table->metadata_location_); io_ = std::move(refreshed_table->io_); + properties_ = std::move(refreshed_table->properties_); schemas_map_.reset(); partition_spec_map_.reset(); @@ -99,9 +113,7 @@ Table::sort_orders() const { return sort_orders_map_; } -const std::unordered_map& Table::properties() const { - return metadata_->properties; -} +const TableProperties& Table::properties() const { return *properties_; } const std::string& Table::location() const { return metadata_->location; } diff --git a/src/iceberg/table.h b/src/iceberg/table.h index 9fce3c29f..0ce006625 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -33,7 +33,7 @@ namespace iceberg { /// \brief Represents an Iceberg table class ICEBERG_EXPORT Table { public: - virtual ~Table() = default; + ~Table(); /// \brief Construct a table. /// \param[in] identifier The identifier of the table. @@ -44,12 +44,7 @@ class ICEBERG_EXPORT Table { /// be read-only. Table(TableIdentifier identifier, std::shared_ptr metadata, std::string metadata_location, std::shared_ptr io, - std::shared_ptr catalog) - : identifier_(std::move(identifier)), - metadata_(std::move(metadata)), - metadata_location_(std::move(metadata_location)), - io_(std::move(io)), - catalog_(std::move(catalog)) {}; + std::shared_ptr catalog); /// \brief Return the identifier of this table const TableIdentifier& name() const { return identifier_; } @@ -85,7 +80,7 @@ class ICEBERG_EXPORT Table { sort_orders() const; /// \brief Return a map of string properties for this table - const std::unordered_map& properties() const; + const TableProperties& properties() const; /// \brief Return the table's base location const std::string& location() const; @@ -122,6 +117,7 @@ class ICEBERG_EXPORT Table { std::string metadata_location_; std::shared_ptr io_; std::shared_ptr catalog_; + std::unique_ptr properties_; // Cache lazy-initialized maps. 
mutable std::shared_ptr>> diff --git a/src/iceberg/table_properties.cc b/src/iceberg/table_properties.cc new file mode 100644 index 000000000..b82fe59b8 --- /dev/null +++ b/src/iceberg/table_properties.cc @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/table_properties.h" + +namespace iceberg { + +const std::unordered_set& TableProperties::reserved_properties() { + static const std::unordered_set kReservedProperties = { + kFormatVersion.key(), kUuid.key(), + kSnapshotCount.key(), kCurrentSnapshotId.key(), + kCurrentSnapshotSummary.key(), kCurrentSnapshotTimestamp.key(), + kCurrentSchema.key(), kDefaultPartitionSpec.key(), + kDefaultSortOrder.key()}; + return kReservedProperties; +} + +std::unique_ptr TableProperties::default_properties() { + return std::make_unique(); +} + +std::unique_ptr TableProperties::FromMap( + const std::unordered_map& properties) { + auto table_properties = std::make_unique(); + for (const auto& [key, value] : properties) { // NOLINT(modernize-type-traits) + table_properties->configs_[key] = value; + } + return table_properties; +} + +} // namespace iceberg diff --git a/src/iceberg/table_properties.h b/src/iceberg/table_properties.h new file mode 100644 index 000000000..3d4257703 --- /dev/null +++ b/src/iceberg/table_properties.h @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/util/config.h" + +namespace iceberg { + +/// \brief Table properties for Iceberg tables. +/// +/// This class provides configuration entries for various Iceberg table properties +/// including format settings, commit behavior, file formats, compression settings, +/// and other table-level configurations. +class ICEBERG_EXPORT TableProperties : public ConfigBase { + public: + template + using Entry = const ConfigBase::Entry; + + // Reserved table properties + + /// \brief Reserved table property for table format version. + /// + /// Iceberg will default a new table's format version to the latest stable and + /// recommended version. This reserved property keyword allows users to override the + /// Iceberg format version of the table metadata. + /// + /// If this table property exists when creating a table, the table will use the + /// specified format version. If a table updates this property, it will try to upgrade + /// to the specified format version. + /// + /// \note incomplete or unstable versions cannot be selected using this property. + inline static Entry kFormatVersion{"format-version", ""}; + /// \brief Reserved table property for table UUID. + inline static Entry kUuid{"uuid", ""}; + /// \brief Reserved table property for the total number of snapshots. + inline static Entry kSnapshotCount{"snapshot-count", ""}; + /// \brief Reserved table property for current snapshot summary. + inline static Entry kCurrentSnapshotSummary{"current-snapshot-summary", + ""}; + /// \brief Reserved table property for current snapshot id. + inline static Entry kCurrentSnapshotId{"current-snapshot-id", ""}; + /// \brief Reserved table property for current snapshot timestamp. + inline static Entry kCurrentSnapshotTimestamp{ + "current-snapshot-timestamp-ms", ""}; + /// \brief Reserved table property for the JSON representation of current schema. 
+ inline static Entry kCurrentSchema{"current-schema", ""}; + /// \brief Reserved table property for the JSON representation of current (default) + /// partition spec. + inline static Entry kDefaultPartitionSpec{"default-partition-spec", ""}; + /// \brief Reserved table property for the JSON representation of current (default) sort + /// order. + inline static Entry kDefaultSortOrder{"default-sort-order", ""}; + + // Commit properties + + inline static Entry kCommitNumRetries{"commit.retry.num-retries", 4}; + inline static Entry kCommitMinRetryWaitMs{"commit.retry.min-wait-ms", 100}; + inline static Entry kCommitMaxRetryWaitMs{"commit.retry.max-wait-ms", + 60 * 1000}; // 1 minute + inline static Entry kCommitTotalRetryTimeMs{"commit.retry.total-timeout-ms", + 30 * 60 * 1000}; // 30 minutes + inline static Entry kCommitNumStatusChecks{"commit.status-check.num-retries", + 3}; + inline static Entry kCommitStatusChecksMinWaitMs{ + "commit.status-check.min-wait-ms", int64_t{1000}}; // 1 second + inline static Entry kCommitStatusChecksMaxWaitMs{ + "commit.status-check.max-wait-ms", int64_t{60 * 1000}}; // 1 minute + inline static Entry kCommitStatusChecksTotalWaitMs{ + "commit.status-check.total-timeout-ms", int64_t{30 * 60 * 1000}}; // 30 minutes + + // Manifest properties + + inline static Entry kManifestTargetSizeBytes{ + "commit.manifest.target-size-bytes", int64_t{8 * 1024 * 1024}}; // 8 MB + inline static Entry kManifestMinMergeCount{ + "commit.manifest.min-count-to-merge", 100}; + inline static Entry kManifestMergeEnabled{"commit.manifest-merge.enabled", true}; + + // File format properties + + inline static Entry kDefaultFileFormat{"write.format.default", "parquet"}; + inline static Entry kDeleteDefaultFileFormat{"write.delete.format.default", + "parquet"}; + + // Parquet properties + + inline static Entry kParquetRowGroupSizeBytes{ + "write.parquet.row-group-size-bytes", 128 * 1024 * 1024}; // 128 MB + inline static Entry kDeleteParquetRowGroupSizeBytes{ + 
"write.delete.parquet.row-group-size-bytes", 128 * 1024 * 1024}; // 128 MB + inline static Entry kParquetPageSizeBytes{"write.parquet.page-size-bytes", + 1024 * 1024}; // 1 MB + inline static Entry kDeleteParquetPageSizeBytes{ + "write.delete.parquet.page-size-bytes", 1024 * 1024}; // 1 MB + inline static Entry kParquetPageRowLimit{"write.parquet.page-row-limit", + 20'000}; + inline static Entry kDeleteParquetPageRowLimit{ + "write.delete.parquet.page-row-limit", 20'000}; + inline static Entry kParquetDictSizeBytes{"write.parquet.dict-size-bytes", + 2 * 1024 * 1024}; // 2 MB + inline static Entry kDeleteParquetDictSizeBytes{ + "write.delete.parquet.dict-size-bytes", 2 * 1024 * 1024}; // 2 MB + inline static Entry kParquetCompression{"write.parquet.compression-codec", + "zstd"}; + inline static Entry kDeleteParquetCompression{ + "write.delete.parquet.compression-codec", "zstd"}; + inline static Entry kParquetCompressionLevel{ + "write.parquet.compression-level", ""}; + inline static Entry kDeleteParquetCompressionLevel{ + "write.delete.parquet.compression-level", ""}; + inline static Entry kParquetRowGroupCheckMinRecordCount{ + "write.parquet.row-group-check-min-record-count", 100}; + inline static Entry kDeleteParquetRowGroupCheckMinRecordCount{ + "write.delete.parquet.row-group-check-min-record-count", 100}; + inline static Entry kParquetRowGroupCheckMaxRecordCount{ + "write.parquet.row-group-check-max-record-count", 10'000}; + inline static Entry kDeleteParquetRowGroupCheckMaxRecordCount{ + "write.delete.parquet.row-group-check-max-record-count", 10'000}; + inline static Entry kParquetBloomFilterMaxBytes{ + "write.parquet.bloom-filter-max-bytes", 1024 * 1024}; // 1 MB + inline static std::string_view kParquetBloomFilterColumnFppPrefix{ + "write.parquet.bloom-filter-fpp.column."}; + inline static std::string_view kParquetBloomFilterColumnEnabledPrefix{ + "write.parquet.bloom-filter-enabled.column."}; + inline static std::string_view 
kParquetColumnStatsEnabledPrefix{ + "write.parquet.stats-enabled.column."}; + + // Avro properties + inline static Entry kAvroCompression{"write.avro.compression-codec", + "gzip"}; + inline static Entry kDeleteAvroCompression{ + "write.delete.avro.compression-codec", "gzip"}; + inline static Entry kAvroCompressionLevel{"write.avro.compression-level", + ""}; + inline static Entry kDeleteAvroCompressionLevel{ + "write.delete.avro.compression-level", ""}; + + // ORC properties + inline static Entry kOrcStripeSizeBytes{"write.orc.stripe-size-bytes", + int64_t{64} * 1024 * 1024}; + inline static Entry kOrcBloomFilterColumns{ + "write.orc.bloom.filter.columns", ""}; + inline static Entry kOrcBloomFilterFpp{"write.orc.bloom.filter.fpp", 0.05}; + inline static Entry kDeleteOrcStripeSizeBytes{ + "write.delete.orc.stripe-size-bytes", int64_t{64} * 1024 * 1024}; // 64 MB + inline static Entry kOrcBlockSizeBytes{"write.orc.block-size-bytes", + int64_t{256} * 1024 * 1024}; // 256 MB + inline static Entry kDeleteOrcBlockSizeBytes{ + "write.delete.orc.block-size-bytes", int64_t{256} * 1024 * 1024}; // 256 MB + inline static Entry kOrcWriteBatchSize{"write.orc.vectorized.batch-size", + 1024}; + inline static Entry kDeleteOrcWriteBatchSize{ + "write.delete.orc.vectorized.batch-size", 1024}; + inline static Entry kOrcCompression{"write.orc.compression-codec", "zlib"}; + inline static Entry kDeleteOrcCompression{ + "write.delete.orc.compression-codec", "zlib"}; + inline static Entry kOrcCompressionStrategy{ + "write.orc.compression-strategy", "speed"}; + inline static Entry kDeleteOrcCompressionStrategy{ + "write.delete.orc.compression-strategy", "speed"}; + + // Read properties + + inline static Entry kSplitSize{"read.split.target-size", + int64_t{128} * 1024 * 1024}; // 128 MB + inline static Entry kMetadataSplitSize{"read.split.metadata-target-size", + int64_t{32} * 1024 * 1024}; // 32 MB + inline static Entry kSplitLookback{"read.split.planning-lookback", 10}; + inline static 
Entry kSplitOpenFileCost{"read.split.open-file-cost", + int64_t{4} * 1024 * 1024}; // 4 MB + inline static Entry kAdaptiveSplitSizeEnabled{"read.split.adaptive-size.enabled", + true}; + inline static Entry kParquetVectorizationEnabled{ + "read.parquet.vectorization.enabled", true}; + inline static Entry kParquetBatchSize{"read.parquet.vectorization.batch-size", + 5000}; + inline static Entry kOrcVectorizationEnabled{"read.orc.vectorization.enabled", + false}; + inline static Entry kOrcBatchSize{"read.orc.vectorization.batch-size", 5000}; + inline static Entry kDataPlanningMode{"read.data-planning-mode", "auto"}; + inline static Entry kDeletePlanningMode{"read.delete-planning-mode", + "auto"}; + + // Write properties + + inline static Entry kObjectStoreEnabled{"write.object-storage.enabled", false}; + /// \brief Includes the partition values in the path when set to true and object store + /// is enabled; set to false to exclude them. + inline static Entry kWriteObjectStorePartitionedPaths{ + "write.object-storage.partitioned-paths", true}; + /// \brief This only applies to files written after this property is set. Files + /// previously written aren't relocated to reflect this parameter. If not set, defaults + /// to a "data" folder underneath the root path of the table. + inline static Entry kWriteDataLocation{"write.data.path", ""}; + /// \brief This only applies to files written after this property is set. Files + /// previously written aren't relocated to reflect this parameter. If not set, defaults + /// to a "metadata" folder underneath the root path of the table. + inline static Entry kWriteMetadataLocation{"write.metadata.path", ""}; + inline static Entry kWritePartitionSummaryLimit{ + "write.summary.partition-limit", 0}; + inline static Entry kMetadataCompression{ + "write.metadata.compression-codec", "none"}; + inline static Entry kMetadataPreviousVersionsMax{ + "write.metadata.previous-versions-max", 100}; + /// \brief When enabled, the oldest metadata file is deleted after a commit. 
+ inline static Entry kMetadataDeleteAfterCommitEnabled{ + "write.metadata.delete-after-commit.enabled", false}; + inline static Entry kMetricsMaxInferredColumnDefaults{ + "write.metadata.metrics.max-inferred-column-defaults", 100}; + inline static Entry kDefaultWriteMetricsMode{ + "write.metadata.metrics.default", "truncate(16)"}; + + inline static std::string_view kDefaultNameMapping{"schema.name-mapping.default"}; + + inline static Entry kWriteAuditPublishEnabled{"write.wap.enabled", false}; + inline static Entry kWriteTargetFileSizeBytes{ + "write.target-file-size-bytes", int64_t{512} * 1024 * 1024}; // 512 MB + inline static Entry kDeleteTargetFileSizeBytes{ + "write.delete.target-file-size-bytes", int64_t{64} * 1024 * 1024}; // 64 MB + + inline static Entry kSnapshotIdInheritanceEnabled{ + "compatibility.snapshot-id-inheritance.enabled", false}; + + // Garbage collection properties + + inline static Entry kGcEnabled{"gc.enabled", true}; + inline static Entry kMaxSnapshotAgeMs{ + "history.expire.max-snapshot-age-ms", int64_t{5} * 24 * 60 * 60 * 1000}; // 5 days + inline static Entry kMinSnapshotsToKeep{"history.expire.min-snapshots-to-keep", + 1}; + inline static Entry kMaxRefAgeMs{"history.expire.max-ref-age-ms", + std::numeric_limits::max()}; + + // Delete/Update/Merge properties + + inline static Entry kDeleteGranularity{"write.delete.granularity", + "partition"}; + inline static Entry kDeleteIsolationLevel{"write.delete.isolation-level", + "serializable"}; + inline static Entry kDeleteMode{"write.delete.mode", "copy-on-write"}; + + inline static Entry kUpdateIsolationLevel{"write.update.isolation-level", + "serializable"}; + inline static Entry kUpdateMode{"write.update.mode", "copy-on-write"}; + + inline static Entry kMergeIsolationLevel{"write.merge.isolation-level", + "serializable"}; + inline static Entry kMergeMode{"write.merge.mode", "copy-on-write"}; + + inline static Entry kUpsertEnabled{"write.upsert.enabled", false}; + + // Encryption properties 
+ + inline static Entry kEncryptionTableKey{"encryption.key-id", ""}; + inline static Entry kEncryptionDekLength{"encryption.data-key-length", 16}; + + /// \brief Get the set of reserved table property keys. + /// + /// Reserved table properties are only used to control behaviors when creating + /// or updating a table. The values of these properties are not persisted as + /// part of the table metadata. + /// + /// \return The set of reserved property keys + static const std::unordered_set& reserved_properties(); + + /// \brief Create a default TableProperties instance. + /// + /// \return A unique pointer to a TableProperties instance with default values + static std::unique_ptr default_properties(); + + /// \brief Create a TableProperties instance from a map of key-value pairs. + /// + /// \param properties The map containing property key-value pairs + /// \return A unique pointer to a TableProperties instance + static std::unique_ptr FromMap( + const std::unordered_map& properties); +}; + +} // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 09d836e8e..01d1a26b3 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -92,6 +92,7 @@ class LocationProvider; class SortField; class SortOrder; class Table; +class TableProperties; class FileIO; class Transaction; class Transform; From 454aaa56fa64f2e15c940e27a19616221ecda83a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 30 Sep 2025 22:33:44 +0800 Subject: [PATCH 2/2] remove kSnapshotIdInheritanceEnabled --- src/iceberg/table_properties.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/iceberg/table_properties.h b/src/iceberg/table_properties.h index 3d4257703..1ab98fec8 100644 --- a/src/iceberg/table_properties.h +++ b/src/iceberg/table_properties.h @@ -242,9 +242,6 @@ class ICEBERG_EXPORT TableProperties : public ConfigBase { inline static Entry kDeleteTargetFileSizeBytes{ "write.delete.target-file-size-bytes", int64_t{64} * 1024 * 1024}; // 64 MB - inline 
static Entry kSnapshotIdInheritanceEnabled{ - "compatibility.snapshot-id-inheritance.enabled", false}; - // Garbage collection properties inline static Entry kGcEnabled{"gc.enabled", true};