Skip to content

Commit 17b5b2d

Browse files
nullccxsy and nullccxsy authored
feat: add Metadata method to Reader and its implementations (#235)
- Introduced a new virtual method `Metadata()` in the `Reader` class to retrieve file metadata. - Implemented `Metadata()` in `AvroReader` to return key-value pairs from the Avro file's metadata. - Implemented `Metadata()` in `ParquetReader` to extract and return key-value pairs from the Parquet file's metadata. --------- Co-authored-by: nullccxsy <[email protected]>
1 parent 257b1ad commit 17b5b2d

File tree

5 files changed

+55
-0
lines changed

5 files changed

+55
-0
lines changed

src/iceberg/avro/avro_reader.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,24 @@ class AvroReader::Impl {
173173
return arrow_schema;
174174
}
175175

176+
/// \brief Collect the key-value metadata of the opened Avro file.
///
/// \return An Invalid error when no underlying reader is available; otherwise
/// a map with one entry per metadata key, where each value is built by copying
/// the stored bytes into a std::string.
Result<std::unordered_map<std::string, std::string>> Metadata() {
  if (!reader_) {
    return Invalid("Reader is not opened");
  }

  const auto& file_metadata = reader_->metadata();

  std::unordered_map<std::string, std::string> result;
  result.reserve(file_metadata.size());
  for (const auto& [key, bytes] : file_metadata) {
    // Values are exposed as a byte range, so materialize them as strings.
    result.insert_or_assign(key, std::string(bytes.begin(), bytes.end()));
  }
  return result;
}
193+
176194
private:
177195
Status InitReadContext() {
178196
context_ = std::make_unique<ReadContext>();
@@ -241,6 +259,10 @@ Result<std::optional<ArrowArray>> AvroReader::Next() { return impl_->Next(); }
241259

242260
Result<ArrowSchema> AvroReader::Schema() { return impl_->Schema(); }
243261

262+
/// \brief Return the Avro file's key-value metadata by delegating to the
/// implementation object.
Result<std::unordered_map<std::string, std::string>> AvroReader::Metadata() {
  return impl_->Metadata();
}
265+
244266
Status AvroReader::Open(const ReaderOptions& options) {
245267
impl_ = std::make_unique<Impl>();
246268
return impl_->Open(options);

src/iceberg/avro/avro_reader.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class ICEBERG_BUNDLE_EXPORT AvroReader : public Reader {
3939

4040
Result<ArrowSchema> Schema() final;
4141

42+
/// \brief Get the key-value metadata of the Avro file as a string-to-string map.
Result<std::unordered_map<std::string, std::string>> Metadata() final;
43+
4244
private:
4345
class Impl;
4446
std::unique_ptr<Impl> impl_;

src/iceberg/file_reader.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ class ICEBERG_EXPORT Reader {
5454

5555
/// \brief Get the schema of the data.
5656
virtual Result<ArrowSchema> Schema() = 0;
57+
58+
/// \brief Get the metadata of the file.
///
/// \return The file's key-value metadata as a map from string keys to string
/// values, or an error result if the metadata cannot be retrieved.
59+
virtual Result<std::unordered_map<std::string, std::string>> Metadata() = 0;
5760
};
5861

5962
/// \brief A split of the file to read.

src/iceberg/parquet/parquet_reader.cc

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <arrow/record_batch.h>
2727
#include <arrow/result.h>
2828
#include <arrow/type.h>
29+
#include <arrow/util/key_value_metadata.h>
2930
#include <parquet/arrow/reader.h>
3031
#include <parquet/arrow/schema.h>
3132
#include <parquet/file_reader.h>
@@ -185,6 +186,27 @@ class ParquetReader::Impl {
185186
return arrow_schema;
186187
}
187188

189+
/// \brief Collect the key-value metadata of the opened Parquet file.
///
/// \return An Invalid error when no underlying reader is available or the
/// file metadata cannot be obtained; otherwise a map of the file's key-value
/// pairs, which is empty when the file carries no key-value metadata.
Result<std::unordered_map<std::string, std::string>> Metadata() {
  if (!reader_) {
    return Invalid("Reader is not opened");
  }

  auto file_metadata = reader_->parquet_reader()->metadata();
  if (file_metadata == nullptr) {
    return Invalid("Failed to get Parquet file metadata");
  }

  std::unordered_map<std::string, std::string> result;
  const auto& key_values = file_metadata->key_value_metadata();
  // Absent key-value metadata is not an error: the map simply stays empty.
  if (key_values != nullptr) {
    key_values->ToUnorderedMap(&result);
  }
  return result;
}
209+
188210
private:
189211
Status InitReadContext() {
190212
context_ = std::make_unique<ReadContext>();
@@ -251,6 +273,10 @@ Result<std::optional<ArrowArray>> ParquetReader::Next() { return impl_->Next();
251273

252274
Result<ArrowSchema> ParquetReader::Schema() { return impl_->Schema(); }
253275

276+
/// \brief Return the Parquet file's key-value metadata by delegating to the
/// implementation object.
Result<std::unordered_map<std::string, std::string>> ParquetReader::Metadata() {
  return impl_->Metadata();
}
279+
254280
Status ParquetReader::Open(const ReaderOptions& options) {
255281
impl_ = std::make_unique<Impl>();
256282
return impl_->Open(options);

src/iceberg/parquet/parquet_reader.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class ICEBERG_BUNDLE_EXPORT ParquetReader : public Reader {
3939

4040
Result<ArrowSchema> Schema() final;
4141

42+
/// \brief Get the key-value metadata of the Parquet file as a string-to-string map.
Result<std::unordered_map<std::string, std::string>> Metadata() final;
43+
4244
private:
4345
class Impl;
4446
std::unique_ptr<Impl> impl_;

0 commit comments

Comments
 (0)