Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ set(ICEBERG_SOURCES
schema_internal.cc
partition_field.cc
partition_spec.cc
statistics_file.cc
table_metadata.cc
transform.cc
type.cc)

Expand Down
82 changes: 82 additions & 0 deletions src/iceberg/statistics_file.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/statistics_file.h"

#include <format>

namespace iceberg {

/// Member-wise equality check: returns true only when every data member of
/// this object compares equal to the corresponding member of `other`.
bool BlobMetadata::Equals(const BlobMetadata& other) const {
  if (type != other.type) {
    return false;
  }
  if (source_snapshot_id != other.source_snapshot_id) {
    return false;
  }
  if (source_snapshot_sequence_number != other.source_snapshot_sequence_number) {
    return false;
  }
  if (fields != other.fields) {
    return false;
  }
  return properties == other.properties;
}

/// Renders the blob metadata as
/// `BlobMetadata[type='..',sourceSnapshotId=..,sourceSnapshotSequenceNumber=..,fields=[..],properties=[k:v,..]]`.
///
/// Entries of `fields` and `properties` are comma-separated with no trailing
/// comma; `properties` entries appear in the map's iteration order.
std::string BlobMetadata::ToString() const {
  // Scalar members come first.
  std::string out = std::format(
      "BlobMetadata[type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},",
      type, source_snapshot_id, source_snapshot_sequence_number);

  // Comma-separated list of fields.
  out += "fields=[";
  bool need_comma = false;
  for (const auto& field : fields) {
    if (need_comma) {
      out += ',';
    }
    std::format_to(std::back_inserter(out), "{}", field);
    need_comma = true;
  }

  // Comma-separated list of key:value pairs.
  out += "],properties=[";
  need_comma = false;
  for (const auto& [key, value] : properties) {
    if (need_comma) {
      out += ',';
    }
    std::format_to(std::back_inserter(out), "{}:{}", key, value);
    need_comma = true;
  }

  out += "]]";
  return out;
}

/// Member-wise equality check: returns true only when every data member of
/// this object compares equal to the corresponding member of `other`.
bool StatisticsFile::Equals(const StatisticsFile& other) const {
  if (snapshot_id != other.snapshot_id) {
    return false;
  }
  if (path != other.path) {
    return false;
  }
  if (file_size_in_bytes != other.file_size_in_bytes) {
    return false;
  }
  if (file_footer_size_in_bytes != other.file_footer_size_in_bytes) {
    return false;
  }
  return blob_metadata == other.blob_metadata;
}

/// Renders the statistics file as
/// `StatisticsFile[snapshotId=..,path=..,fileSizeInBytes=..,fileFooterSizeInBytes=..,blobMetadata=[..]]`.
///
/// Each element of `blob_metadata` is rendered through its own ToString(),
/// comma-separated with no trailing comma.
std::string StatisticsFile::ToString() const {
  std::string out = std::format(
      "StatisticsFile[snapshotId={},path={},fileSizeInBytes={},fileFooterSizeInBytes={},",
      snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes);

  out += "blobMetadata=[";
  // Empty before the first element, "," before every later one.
  const char* separator = "";
  for (const auto& blob : blob_metadata) {
    out += separator;
    out += blob.ToString();
    separator = ",";
  }

  out += "]]";
  return out;
}

} // namespace iceberg
104 changes: 104 additions & 0 deletions src/iceberg/statistics_file.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/statistics_file.h
/// Statistics file for Iceberg tables.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/util/formattable.h"

namespace iceberg {

/// \brief Metadata describing a single statistics or index blob.
struct ICEBERG_EXPORT BlobMetadata : public util::Formattable {
  /// Blob type
  std::string type;
  /// ID of the table snapshot from which the blob was computed
  int64_t source_snapshot_id;
  /// Sequence number of the table snapshot from which the blob was computed
  int64_t source_snapshot_sequence_number;
  /// Fields the blob was calculated from, in order
  std::vector<int32_t> fields;
  /// Extra, blob-type-specific properties
  std::unordered_map<std::string, std::string> properties;

  /// \brief Equality operator; delegates to the private Equals().
  friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) {
    return lhs.Equals(rhs);
  }

  /// \brief Inequality operator; the negation of Equals().
  friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) {
    return !lhs.Equals(rhs);
  }

  /// \brief Produce a human-readable string representation of this blob metadata.
  std::string ToString() const override;

 private:
  /// \brief Member-wise comparison backing operator== and operator!=.
  bool Equals(const BlobMetadata& other) const;
};

/// \brief A statistics file in the Puffin format.
struct ICEBERG_EXPORT StatisticsFile : public util::Formattable {
  /// ID of the table snapshot this statistics file is associated with
  int64_t snapshot_id;
  /// Fully qualified path to the file
  std::string path;
  /// Size of the file, in bytes
  int64_t file_size_in_bytes;
  /// Size of the file footer, in bytes
  int64_t file_footer_size_in_bytes;
  /// Statistics contained in the file, one entry per blob
  std::vector<BlobMetadata> blob_metadata;

  /// \brief Equality operator; delegates to the private Equals().
  friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) {
    return lhs.Equals(rhs);
  }

  /// \brief Inequality operator; the negation of Equals().
  friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) {
    return !lhs.Equals(rhs);
  }

  /// \brief Produce a human-readable string representation of this statistics file.
  std::string ToString() const override;

 private:
  /// \brief Member-wise comparison backing operator== and operator!=.
  bool Equals(const StatisticsFile& other) const;
};

/// \brief Represents a partition statistics file
///
/// A plain data struct: unlike BlobMetadata and StatisticsFile in this header, it
/// does not derive from util::Formattable and declares no comparison operators.
struct ICEBERG_EXPORT PartitionStatisticsFile {
/// Snapshot ID of the Iceberg table's snapshot the partition statistics file is
/// associated with
int64_t snapshot_id;
/// Fully qualified path to the file
std::string path;
/// The size of the partition statistics file in bytes
int64_t file_size_in_bytes;
};

} // namespace iceberg
38 changes: 38 additions & 0 deletions src/iceberg/table_metadata.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/table_metadata.h"

#include <format>
#include <string>

#include "iceberg/statistics_file.h"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why this include is here and not in table_metadata.h, where we seem to require it for the following?

  std::vector<std::shared_ptr<struct StatisticsFile>> statistics;
  /// A list of partition statistics
  std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just want to use forward declarations as much as possible. The implementation is not yet complete because the concrete implementations of Snapshot and other classes are missing, so it looks weird that iceberg/statistics_file.h is included but not used at the moment.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But since those would still be part of the table metadata API, wouldn't it be better for the include to be part of the header file? I am learning C++; I just thought it would still be better there even when we add the implementation part, but I might be wrong :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a common practice to use forward declaration to speed up compilation if the implementation detail is not required in the header file, though I suspect that modern compilers are smart enough to optimize this.


namespace iceberg {

/// Renders the entry as `SnapshotLogEntry[timestampMillis=..,snapshotId=..]`.
/// Both members are formatted with the default `{}` specifier.
std::string SnapshotLogEntry::ToString() const {
  std::string text = "SnapshotLogEntry[";
  text += std::format("timestampMillis={},snapshotId={}", timestamp_ms, snapshot_id);
  text += ']';
  return text;
}

/// Renders the entry as `MetadataLogEntry[timestampMillis=..,file=..]`.
/// The timestamp is formatted with the default `{}` specifier; the file
/// location is appended verbatim.
std::string MetadataLogEntry::ToString() const {
  std::string text = std::format("MetadataLogEntry[timestampMillis={}", timestamp_ms);
  text += ",file=";
  text += file;
  text += ']';
  return text;
}

} // namespace iceberg
120 changes: 120 additions & 0 deletions src/iceberg/table_metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

/// \file iceberg/table_metadata.h
/// Table metadata for Iceberg tables.

#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/formattable.h"

namespace iceberg {

/// \brief A time point on std::chrono::system_clock with millisecond resolution.
using TimePointMs =
std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds>;

/// \brief Represents a snapshot log entry
///
/// Pairs a snapshot ID with the time point of the change.
struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable {
/// The timestamp in milliseconds of the change
TimePointMs timestamp_ms;
/// ID of the snapshot
int64_t snapshot_id;

/// \brief Returns a string of the form
/// "SnapshotLogEntry[timestampMillis=...,snapshotId=...]".
std::string ToString() const override;
};

/// \brief Represents a metadata log entry
///
/// Pairs a metadata file location with the time point of the change.
struct ICEBERG_EXPORT MetadataLogEntry : public util::Formattable {
/// The timestamp in milliseconds of the change
TimePointMs timestamp_ms;
/// Metadata file location
std::string file;

/// \brief Returns a string of the form
/// "MetadataLogEntry[timestampMillis=...,file=...]".
std::string ToString() const override;
};

/// \brief Represents the metadata for an Iceberg table
///
/// Note that it only contains table metadata from the spec. Compared to the Java
/// implementation, the missing pieces include: 1) Map<Integer,
/// Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> 3) Map<Long, Snapshot> 4)
/// Map<String, SnapshotRef>
///
/// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are
/// implemented.
struct ICEBERG_EXPORT TableMetadata {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to mark some fields as optional?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think we need to, but I'm still unclear about which ones should change. I will postpone this decision until we implement JSON serialization to/from the different table format versions.

/// An integer version number for the format
int8_t format_version;
/// A UUID that identifies the table
std::string table_uuid;
/// The table's base location
std::string location;
/// The table's highest assigned sequence number
int64_t last_sequence_number;
/// Timestamp in milliseconds from the unix epoch when the table was last updated.
int64_t last_updated_ms;
/// The highest assigned column ID for the table
int32_t last_column_id;
/// A list of schemas
std::vector<std::shared_ptr<Schema>> schemas;
/// ID of the table's current schema
int32_t current_schema_id;
/// A list of partition specs
std::vector<std::shared_ptr<PartitionSpec>> partition_specs;
/// ID of the current partition spec that writers should use by default
int32_t default_spec_id;
/// The highest assigned partition field ID across all partition specs for the table
int32_t last_partition_id;
/// A string to string map of table properties
std::unordered_map<std::string, std::string> properties;
/// ID of the current table snapshot
int64_t current_snapshot_id;
/// A list of valid snapshots
std::vector<std::shared_ptr<Snapshot>> snapshots;
/// A list of timestamp and snapshot ID pairs that encodes changes to the current
/// snapshot for the table
std::vector<SnapshotLogEntry> snapshot_log;
/// A list of timestamp and metadata file location pairs that encodes changes to the
/// previous metadata files for the table
std::vector<MetadataLogEntry> metadata_log;
/// A list of sort orders
std::vector<std::shared_ptr<SortOrder>> sort_orders;
/// Default sort order id of the table
int32_t default_sort_order_id;
/// A map of snapshot references
std::unordered_map<std::string, std::string> refs;
/// A list of table statistics
std::vector<std::shared_ptr<struct StatisticsFile>> statistics;
/// A list of partition statistics
std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics;
/// whether or not to track the creation and updates to rows in the table
bool row_lineage_enabled = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Row lineage will always be enabled from V3 onwards (see apache/iceberg#12593), so I think we can remove this 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the heads-up! I found that the Java impl still has this field which looks weird to me.

/// A `long` higher than all assigned row IDs
int64_t next_row_id;
};

} // namespace iceberg
Loading