From 6948a2939bc489710ea9ed36a4e0b183e979eb62 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Wed, 17 Sep 2025 16:39:35 +0800 Subject: [PATCH 1/4] feat: add Metadata method to Reader and its implementations - Introduced a new virtual method `Metadata()` in the `Reader` class to retrieve file metadata. - Implemented `Metadata()` in `AvroReader` to return key-value pairs from the Avro file's metadata. - Implemented `Metadata()` in `ParquetReader` to extract and return key-value pairs from the Parquet file's metadata. --- src/iceberg/avro/avro_reader.cc | 22 ++++++++++++++++++++ src/iceberg/avro/avro_reader.h | 2 ++ src/iceberg/file_reader.h | 3 +++ src/iceberg/parquet/parquet_reader.cc | 30 +++++++++++++++++++++++++++ src/iceberg/parquet/parquet_reader.h | 2 ++ 5 files changed, 59 insertions(+) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 048cd4997..581ec115b 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -173,6 +173,24 @@ class AvroReader::Impl { return arrow_schema; } + Result> Metadata() { + if (reader_ == nullptr) { + return InvalidArgument("Reader is not opened"); + } + + auto metadata = reader_->metadata(); + + std::unordered_map metadata_map; + metadata_map.reserve(metadata.size()); + + for (const auto& pair : metadata) { + metadata_map.try_emplace(pair.first, + std::string(pair.second.begin(), pair.second.end())); + } + + return metadata_map; + } + private: Status InitReadContext() { context_ = std::make_unique(); @@ -241,6 +259,10 @@ Result> AvroReader::Next() { return impl_->Next(); } Result AvroReader::Schema() { return impl_->Schema(); } +Result> AvroReader::Metadata() { + return impl_->Metadata(); +} + Status AvroReader::Open(const ReaderOptions& options) { impl_ = std::make_unique(); return impl_->Open(options); diff --git a/src/iceberg/avro/avro_reader.h b/src/iceberg/avro/avro_reader.h index 07737bb7b..24f95f5d4 100644 --- a/src/iceberg/avro/avro_reader.h +++ b/src/iceberg/avro/avro_reader.h @@ -39,6 +39,8 @@ class ICEBERG_BUNDLE_EXPORT AvroReader : public Reader { Result Schema() final; + Result> Metadata() final; + private: class Impl; std::unique_ptr impl_; diff --git a/src/iceberg/file_reader.h b/src/iceberg/file_reader.h index 8a59e33fe..65b88520c 100644 --- a/src/iceberg/file_reader.h +++ b/src/iceberg/file_reader.h @@ -54,6 +54,9 @@ class ICEBERG_EXPORT Reader { /// \brief Get the schema of the data. virtual Result Schema() = 0; + + /// \brief Get the metadata of the file + virtual Result> Metadata() = 0; }; /// \brief A split of the file to read. diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index 4c86802b3..87bce816b 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,31 @@ class ParquetReader::Impl { return arrow_schema; } + Result> Metadata() { + if (reader_ == nullptr) { + return InvalidArgument("Reader is not opened"); + } + + auto metadata = reader_->parquet_reader()->metadata(); + if (!metadata) { + return InvalidArgument("Failed to get Parquet file metadata"); + } + + auto kv_metadata = metadata->key_value_metadata(); + if (!kv_metadata) { + return std::unordered_map{}; + } + + std::unordered_map metadata_map; + metadata_map.reserve(kv_metadata->size()); + + for (int i = 0; i < kv_metadata->size(); ++i) { + metadata_map.try_emplace(kv_metadata->key(i), kv_metadata->value(i)); + } + + return metadata_map; + } + private: Status InitReadContext() { context_ = std::make_unique(); @@ -251,6 +277,10 @@ Result> ParquetReader::Next() { return impl_->Next(); Result ParquetReader::Schema() { return impl_->Schema(); } +Result> ParquetReader::Metadata() { + return impl_->Metadata(); +} + Status ParquetReader::Open(const ReaderOptions& options) { impl_ = std::make_unique(); return impl_->Open(options); diff --git a/src/iceberg/parquet/parquet_reader.h b/src/iceberg/parquet/parquet_reader.h index 23d34dfa9..0604230c8 100644 --- a/src/iceberg/parquet/parquet_reader.h +++ b/src/iceberg/parquet/parquet_reader.h @@ -39,6 +39,8 @@ class ICEBERG_BUNDLE_EXPORT ParquetReader : public Reader { Result Schema() final; + Result> Metadata() final; + private: class Impl; std::unique_ptr impl_; From 7c1aa3d8c84e0f6258296666bbd076bfe5f5cc0b Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Fri, 19 Sep 2025 16:04:41 +0800 Subject: [PATCH 2/4] fix review --- src/iceberg/avro/avro_reader.cc | 11 +++++++---- src/iceberg/parquet/parquet_reader.cc | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 581ec115b..3f0f8409e 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -175,17 +175,20 @@ class AvroReader::Impl { Result> Metadata() { if (reader_ == nullptr) { - return InvalidArgument("Reader is not opened"); + return Invalid("Reader is not opened"); } - auto metadata = reader_->metadata(); + const auto& metadata = reader_->metadata(); std::unordered_map metadata_map; metadata_map.reserve(metadata.size()); for (const auto& pair : metadata) { - metadata_map.try_emplace(pair.first, - std::string(pair.second.begin(), pair.second.end())); + auto [it, inserted] = metadata_map.try_emplace( + pair.first, std::string(pair.second.begin(), pair.second.end())); + if (!inserted) { + return Invalid("Duplicate metadata key found: {}", pair.first); + } } return metadata_map; diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index 87bce816b..9923c8392 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -188,15 +188,15 @@ class ParquetReader::Impl { Result> Metadata() { if (reader_ == nullptr) { - return InvalidArgument("Reader is not opened"); + return Invalid("Reader is not opened"); } auto metadata = reader_->parquet_reader()->metadata(); if (!metadata) { - return InvalidArgument("Failed to get Parquet file metadata"); + return Invalid("Failed to get Parquet file metadata"); } - auto kv_metadata = metadata->key_value_metadata(); + const auto& kv_metadata = metadata->key_value_metadata(); if (!kv_metadata) { return std::unordered_map{}; } @@ -205,7 +205,11 @@ class ParquetReader::Impl { metadata_map.reserve(kv_metadata->size()); for (int i = 0; i < kv_metadata->size(); ++i) { - metadata_map.try_emplace(kv_metadata->key(i), kv_metadata->value(i)); + auto [it, inserted] = + metadata_map.try_emplace(kv_metadata->key(i), kv_metadata->value(i)); + if (!inserted) { + return Invalid("Duplicate metadata key found: {}", kv_metadata->key(i)); + } } return metadata_map; From 54f4cccef92cb07949bdb4c37821085e5c26c950 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Mon, 22 Sep 2025 10:51:48 +0800 Subject: [PATCH 3/4] fix review --- src/iceberg/avro/avro_reader.cc | 7 ++----- src/iceberg/parquet/parquet_reader.cc | 6 +----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 3f0f8409e..64526123f 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -184,11 +184,8 @@ class AvroReader::Impl { metadata_map.reserve(metadata.size()); for (const auto& pair : metadata) { - auto [it, inserted] = metadata_map.try_emplace( - pair.first, std::string(pair.second.begin(), pair.second.end())); - if (!inserted) { - return Invalid("Duplicate metadata key found: {}", pair.first); - } + metadata_map.insert_or_assign(pair.first, + std::string(pair.second.begin(), pair.second.end())); } return metadata_map; diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index 9923c8392..b6ab97593 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -205,11 +205,7 @@ class ParquetReader::Impl { metadata_map.reserve(kv_metadata->size()); for (int i = 0; i < kv_metadata->size(); ++i) { - auto [it, inserted] = - metadata_map.try_emplace(kv_metadata->key(i), kv_metadata->value(i)); - if (!inserted) { - return Invalid("Duplicate metadata key found: {}", kv_metadata->key(i)); - } + metadata_map.insert_or_assign(kv_metadata->key(i), kv_metadata->value(i)); } return metadata_map; From a88d6811569073417bf364ac31fc12d88d9e94b1 Mon Sep 17 00:00:00 2001 From: nullccxsy <32149055912@qq.com> Date: Tue, 23 Sep 2025 17:05:06 +0800 Subject: [PATCH 4/4] fix review --- src/iceberg/file_reader.h | 2 +- src/iceberg/parquet/parquet_reader.cc | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/iceberg/file_reader.h b/src/iceberg/file_reader.h index 65b88520c..d25a5e451 100644 --- a/src/iceberg/file_reader.h +++ b/src/iceberg/file_reader.h @@ -55,7 +55,7 @@ class ICEBERG_EXPORT Reader { /// \brief Get the schema of the data. virtual Result Schema() = 0; - /// \brief Get the metadata of the file + /// \brief Get the metadata of the file. virtual Result> Metadata() = 0; }; diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index b6ab97593..e57b98e87 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -202,11 +202,7 @@ class ParquetReader::Impl { } std::unordered_map metadata_map; - metadata_map.reserve(kv_metadata->size()); - - for (int i = 0; i < kv_metadata->size(); ++i) { - metadata_map.insert_or_assign(kv_metadata->key(i), kv_metadata->value(i)); - } + kv_metadata->ToUnorderedMap(&metadata_map); return metadata_map; }