Skip to content

Commit 1de36a3

Browse files
committed
feat: add manifest related structures
Add DataFile, ManifestEntry, ManifestFile, and ManifestList to Iceberg core. Support for parsing these data structures from Avro files will be added in future PRs.
1 parent b839b3b commit 1de36a3

File tree

8 files changed

+742
-2
lines changed

8 files changed

+742
-2
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ set(ICEBERG_SOURCES
2323
expression/expression.cc
2424
file_reader.cc
2525
json_internal.cc
26+
manifest_entry.cc
27+
manifest_list.cc
2628
partition_field.cc
2729
partition_spec.cc
2830
schema.cc

src/iceberg/file_format.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <string_view>
2626

2727
#include "iceberg/iceberg_export.h"
28+
#include "iceberg/result.h"
2829

2930
namespace iceberg {
3031

@@ -50,4 +51,14 @@ ICEBERG_EXPORT inline std::string_view ToString(FileFormatType format_type) {
5051
}
5152
}
5253

54+
/// \brief Convert a string to a FileFormatType
55+
ICEBERG_EXPORT constexpr Result<FileFormatType> FileFormatTypeFromString(
56+
std::string_view str) noexcept {
57+
if (str == "parquet") return FileFormatType::kParquet;
58+
if (str == "avro") return FileFormatType::kAvro;
59+
if (str == "orc") return FileFormatType::kOrc;
60+
if (str == "puffin") return FileFormatType::kPuffin;
61+
return InvalidArgument("Invalid file format type: {}", str);
62+
}
63+
5364
} // namespace iceberg

src/iceberg/manifest_entry.cc

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/manifest_entry.h"
21+
22+
#include <memory>
23+
#include <optional>
24+
#include <vector>
25+
26+
#include "iceberg/schema.h"
27+
#include "iceberg/schema_field.h"
28+
#include "iceberg/type.h"
29+
30+
namespace iceberg {
31+
// Schema fields of the data file struct. Each definition fixes the field id,
// the field name and the field type; DataFile::GetType() assembles them.

// Required scalar fields.
const SchemaField DataFile::CONTENT =
    SchemaField::MakeRequired(134, "content", std::make_shared<IntType>());
const SchemaField DataFile::FILE_PATH =
    SchemaField::MakeRequired(100, "file_path", std::make_shared<StringType>());
// The Iceberg table spec defines `file_format` as a string holding the format
// name (the kind of value FileFormatTypeFromString parses), not as an int.
const SchemaField DataFile::FILE_FORMAT =
    SchemaField::MakeRequired(101, "file_format", std::make_shared<StringType>());
const SchemaField DataFile::RECORD_COUNT =
    SchemaField::MakeRequired(103, "record_count", std::make_shared<LongType>());
const SchemaField DataFile::FILE_SIZE =
    SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared<LongType>());

// Optional maps from an int key to a long value.
const SchemaField DataFile::COLUMN_SIZES = SchemaField::MakeOptional(
    108, "column_sizes",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(118, std::string(MapType::kValueName),
                                  std::make_shared<LongType>())));
const SchemaField DataFile::VALUE_COUNTS = SchemaField::MakeOptional(
    109, "value_counts",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(120, std::string(MapType::kValueName),
                                  std::make_shared<LongType>())));
const SchemaField DataFile::NULL_VALUE_COUNTS = SchemaField::MakeOptional(
    110, "null_value_counts",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(122, std::string(MapType::kValueName),
                                  std::make_shared<LongType>())));
const SchemaField DataFile::NAN_VALUE_COUNTS = SchemaField::MakeOptional(
    137, "nan_value_counts",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(139, std::string(MapType::kValueName),
                                  std::make_shared<LongType>())));

// Optional maps from an int key to a binary value.
const SchemaField DataFile::LOWER_BOUNDS = SchemaField::MakeOptional(
    125, "lower_bounds",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(127, std::string(MapType::kValueName),
                                  std::make_shared<BinaryType>())));
const SchemaField DataFile::UPPER_BOUNDS = SchemaField::MakeOptional(
    128, "upper_bounds",
    std::make_shared<MapType>(
        SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
                                  std::make_shared<IntType>()),
        SchemaField::MakeRequired(130, std::string(MapType::kValueName),
                                  std::make_shared<BinaryType>())));

const SchemaField DataFile::KEY_METADATA =
    SchemaField::MakeOptional(131, "key_metadata", std::make_shared<BinaryType>());

// Optional lists with required elements.
const SchemaField DataFile::SPLIT_OFFSETS = SchemaField::MakeOptional(
    132, "split_offsets",
    std::make_shared<ListType>(SchemaField::MakeRequired(
        133, std::string(ListType::kElementName), std::make_shared<LongType>())));
const SchemaField DataFile::EQUALITY_IDS = SchemaField::MakeOptional(
    135, "equality_ids",
    std::make_shared<ListType>(SchemaField::MakeRequired(
        136, std::string(ListType::kElementName), std::make_shared<IntType>())));

// Remaining optional scalar fields.
const SchemaField DataFile::SORT_ORDER_ID =
    SchemaField::MakeOptional(140, "sort_order_id", std::make_shared<IntType>());
const SchemaField DataFile::FIRST_ROW_ID =
    SchemaField::MakeOptional(142, "first_row_id", std::make_shared<LongType>());
const SchemaField DataFile::REFERENCED_DATA_FILE = SchemaField::MakeOptional(
    143, "referenced_data_file", std::make_shared<StringType>());
const SchemaField DataFile::CONTENT_OFFSET =
    SchemaField::MakeOptional(144, "content_offset", std::make_shared<LongType>());
const SchemaField DataFile::CONTENT_SIZE =
    SchemaField::MakeOptional(145, "content_size_in_bytes", std::make_shared<LongType>());
103+
104+
/// \brief Build the struct type of a data file for a given partition type.
///
/// \param partition_type Struct type used for the required `partition` field
/// (field id 102).
/// \return A StructType containing the DataFile fields, with `partition`
/// inserted right after `file_format`.
StructType DataFile::GetType(StructType partition_type) {
  // 19 static fields plus the partition field built below.
  constexpr std::vector<SchemaField>::size_type kFieldCount = 20;

  std::vector<SchemaField> fields;
  fields.reserve(kFieldCount);

  fields.push_back(CONTENT);
  fields.push_back(FILE_PATH);
  fields.push_back(FILE_FORMAT);
  // `partition_type` is a by-value parameter that is not used afterwards, so it
  // is moved into the shared StructType rather than copied a second time.
  fields.push_back(SchemaField::MakeRequired(
      102, "partition", std::make_shared<StructType>(std::move(partition_type))));
  fields.push_back(RECORD_COUNT);
  fields.push_back(FILE_SIZE);
  fields.push_back(COLUMN_SIZES);
  fields.push_back(VALUE_COUNTS);
  fields.push_back(NULL_VALUE_COUNTS);
  fields.push_back(NAN_VALUE_COUNTS);
  fields.push_back(LOWER_BOUNDS);
  fields.push_back(UPPER_BOUNDS);
  fields.push_back(KEY_METADATA);
  fields.push_back(SPLIT_OFFSETS);
  fields.push_back(EQUALITY_IDS);
  fields.push_back(SORT_ORDER_ID);
  fields.push_back(FIRST_ROW_ID);
  fields.push_back(REFERENCED_DATA_FILE);
  fields.push_back(CONTENT_OFFSET);
  fields.push_back(CONTENT_SIZE);

  return StructType(std::move(fields));
}
131+
132+
// Schema fields of a manifest entry other than `data_file`, which
// ManifestEntry::GetSchemaFromDataFileType() builds with field id 2.

// The only required field of this group.
const SchemaField ManifestEntry::STATUS(
    SchemaField::MakeRequired(0, "status", std::make_shared<IntType>()));

// Optional long-typed fields.
const SchemaField ManifestEntry::SNAPSHOT_ID(
    SchemaField::MakeOptional(1, "snapshot_id", std::make_shared<LongType>()));
const SchemaField ManifestEntry::SEQUENCE_NUMBER(
    SchemaField::MakeOptional(3, "sequence_number", std::make_shared<LongType>()));
const SchemaField ManifestEntry::FILE_SEQUENCE_NUMBER(SchemaField::MakeOptional(
    4, "file_sequence_number", std::make_shared<LongType>()));
140+
141+
/// \brief Build the manifest entry schema for a given partition type.
///
/// \param partition_type Struct type of the partition; it is handed to
/// DataFile::GetType() to produce the data file struct type.
/// \return The Schema produced by GetSchemaFromDataFileType() for that data
/// file struct type.
Schema ManifestEntry::GetSchema(StructType partition_type) {
  // The by-value parameter is not used again, so move it instead of copying.
  return GetSchemaFromDataFileType(DataFile::GetType(std::move(partition_type)));
}
144+
145+
/// \brief Build the manifest entry schema around an existing data file type.
///
/// \param datafile_type Struct type of the `data_file` field, for example the
/// result of DataFile::GetType(). It is used as-is.
/// \return A Schema without a schema id whose fields are status, snapshot_id,
/// sequence_number, file_sequence_number and the required `data_file` struct
/// (field id 2), in that order.
Schema ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) {
  std::vector<SchemaField> fields;
  fields.reserve(5);

  fields.push_back(STATUS);
  fields.push_back(SNAPSHOT_ID);
  fields.push_back(SEQUENCE_NUMBER);
  fields.push_back(FILE_SEQUENCE_NUMBER);

  // `datafile_type` already is the complete data file struct. Passing it
  // through DataFile::GetType() again would nest it as the `partition` field
  // of a second data file struct, so it is wrapped directly.
  fields.push_back(SchemaField::MakeRequired(
      2, "data_file", std::make_shared<StructType>(std::move(datafile_type))));

  return {std::move(fields), /*schema_id=*/std::nullopt};
}
160+
161+
} // namespace iceberg

0 commit comments

Comments
 (0)