Skip to content

Commit fd91bb1

Browse files
committed
fix: more review comments
Signed-off-by: Junwang Zhao <[email protected]>
1 parent c817b2d commit fd91bb1

File tree

5 files changed

+203
-279
lines changed

5 files changed

+203
-279
lines changed

src/iceberg/manifest_entry.cc

Lines changed: 30 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -26,134 +26,41 @@
2626
#include "iceberg/type.h"
2727

2828
namespace iceberg {
29-
const SchemaField DataFile::CONTENT =
30-
SchemaField::MakeRequired(134, "content", std::make_shared<IntType>());
31-
const SchemaField DataFile::FILE_PATH =
32-
SchemaField::MakeRequired(100, "file_path", std::make_shared<StringType>());
33-
const SchemaField DataFile::FILE_FORMAT =
34-
SchemaField::MakeRequired(101, "file_format", std::make_shared<IntType>());
35-
const SchemaField DataFile::RECORD_COUNT =
36-
SchemaField::MakeRequired(103, "record_count", std::make_shared<LongType>());
37-
const SchemaField DataFile::FILE_SIZE =
38-
SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared<LongType>());
39-
const SchemaField DataFile::COLUMN_SIZES = SchemaField::MakeOptional(
40-
108, "column_sizes",
41-
std::make_shared<MapType>(
42-
SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
43-
std::make_shared<IntType>()),
44-
SchemaField::MakeRequired(118, std::string(MapType::kValueName),
45-
std::make_shared<LongType>())));
46-
const SchemaField DataFile::VALUE_COUNTS = SchemaField::MakeOptional(
47-
109, "value_counts",
48-
std::make_shared<MapType>(
49-
SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
50-
std::make_shared<IntType>()),
51-
SchemaField::MakeRequired(120, std::string(MapType::kValueName),
52-
std::make_shared<LongType>())));
53-
const SchemaField DataFile::NULL_VALUE_COUNTS = SchemaField::MakeOptional(
54-
110, "null_value_counts",
55-
std::make_shared<MapType>(
56-
SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
57-
std::make_shared<IntType>()),
58-
SchemaField::MakeRequired(122, std::string(MapType::kValueName),
59-
std::make_shared<LongType>())));
60-
const SchemaField DataFile::NAN_VALUE_COUNTS = SchemaField::MakeOptional(
61-
137, "nan_value_counts",
62-
std::make_shared<MapType>(
63-
SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
64-
std::make_shared<IntType>()),
65-
SchemaField::MakeRequired(139, std::string(MapType::kValueName),
66-
std::make_shared<LongType>())));
67-
const SchemaField DataFile::LOWER_BOUNDS = SchemaField::MakeOptional(
68-
125, "lower_bounds",
69-
std::make_shared<MapType>(
70-
SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
71-
std::make_shared<IntType>()),
72-
SchemaField::MakeRequired(127, std::string(MapType::kValueName),
73-
std::make_shared<BinaryType>())));
74-
const SchemaField DataFile::UPPER_BOUNDS = SchemaField::MakeOptional(
75-
128, "upper_bounds",
76-
std::make_shared<MapType>(
77-
SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
78-
std::make_shared<IntType>()),
79-
SchemaField::MakeRequired(130, std::string(MapType::kValueName),
80-
std::make_shared<BinaryType>())));
81-
const SchemaField DataFile::KEY_METADATA =
82-
SchemaField::MakeOptional(131, "key_metadata", std::make_shared<BinaryType>());
83-
const SchemaField DataFile::SPLIT_OFFSETS = SchemaField::MakeOptional(
84-
132, "split_offsets",
85-
std::make_shared<ListType>(SchemaField::MakeRequired(
86-
133, std::string(ListType::kElementName), std::make_shared<LongType>())));
87-
const SchemaField DataFile::EQUALITY_IDS = SchemaField::MakeOptional(
88-
135, "equality_ids",
89-
std::make_shared<ListType>(SchemaField::MakeRequired(
90-
136, std::string(ListType::kElementName), std::make_shared<IntType>())));
91-
const SchemaField DataFile::SORT_ORDER_ID =
92-
SchemaField::MakeOptional(140, "sort_order_id", std::make_shared<IntType>());
93-
const SchemaField DataFile::FIRST_ROW_ID =
94-
SchemaField::MakeOptional(142, "first_row_id", std::make_shared<LongType>());
95-
const SchemaField DataFile::REFERENCED_DATA_FILE = SchemaField::MakeOptional(
96-
143, "referenced_data_file", std::make_shared<StringType>());
97-
const SchemaField DataFile::CONTENT_OFFSET =
98-
SchemaField::MakeOptional(144, "content_offset", std::make_shared<LongType>());
99-
const SchemaField DataFile::CONTENT_SIZE =
100-
SchemaField::MakeOptional(145, "content_size_in_bytes", std::make_shared<LongType>());
10129

102-
StructType DataFile::GetType(StructType partition_type) {
103-
std::vector<SchemaField> fields;
104-
105-
fields.push_back(CONTENT);
106-
fields.push_back(FILE_PATH);
107-
fields.push_back(FILE_FORMAT);
108-
fields.push_back(SchemaField::MakeRequired(
109-
102, "partition", std::make_shared<StructType>(partition_type)));
110-
fields.push_back(RECORD_COUNT);
111-
fields.push_back(FILE_SIZE);
112-
fields.push_back(COLUMN_SIZES);
113-
fields.push_back(VALUE_COUNTS);
114-
fields.push_back(NULL_VALUE_COUNTS);
115-
fields.push_back(NAN_VALUE_COUNTS);
116-
fields.push_back(LOWER_BOUNDS);
117-
fields.push_back(UPPER_BOUNDS);
118-
fields.push_back(KEY_METADATA);
119-
fields.push_back(SPLIT_OFFSETS);
120-
fields.push_back(EQUALITY_IDS);
121-
fields.push_back(SORT_ORDER_ID);
122-
fields.push_back(FIRST_ROW_ID);
123-
fields.push_back(REFERENCED_DATA_FILE);
124-
fields.push_back(CONTENT_OFFSET);
125-
fields.push_back(CONTENT_SIZE);
126-
127-
return StructType(std::move(fields));
30+
std::shared_ptr<StructType> DataFile::Type(std::shared_ptr<StructType> partition_type) {
31+
return std::make_shared<StructType>(std::vector<SchemaField>{
32+
kContent,
33+
kFilePath,
34+
kFileFormat,
35+
SchemaField::MakeRequired(102, "partition", std::move(partition_type)),
36+
kRecordCount,
37+
kFileSize,
38+
kColumnSizes,
39+
kValueCounts,
40+
kNullValueCounts,
41+
kNanValueCounts,
42+
kLowerBounds,
43+
kUpperBounds,
44+
kKeyMetadata,
45+
kSplitOffsets,
46+
kEqualityIds,
47+
kSortOrderId,
48+
kFirstRowId,
49+
kReferencedDataFile,
50+
kContentOffset,
51+
kContentSize});
12852
}
12953

130-
const SchemaField ManifestEntry::STATUS =
131-
SchemaField::MakeRequired(0, "status", std::make_shared<IntType>());
132-
const SchemaField ManifestEntry::SNAPSHOT_ID =
133-
SchemaField::MakeOptional(1, "snapshot_id", std::make_shared<LongType>());
134-
const SchemaField ManifestEntry::SEQUENCE_NUMBER =
135-
SchemaField::MakeOptional(3, "sequence_number", std::make_shared<LongType>());
136-
const SchemaField ManifestEntry::FILE_SEQUENCE_NUMBER =
137-
SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared<LongType>());
138-
139-
StructType ManifestEntry::GetSchema(StructType partition_type) {
140-
return GetSchemaFromDataFileType(DataFile::GetType(partition_type));
54+
std::shared_ptr<StructType> ManifestEntry::TypeFromPartitionType(
55+
std::shared_ptr<StructType> partition_type) {
56+
return TypeFromDataFileType(DataFile::Type(std::move(partition_type)));
14157
}
14258

143-
StructType ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) {
144-
std::vector<SchemaField> fields;
145-
146-
fields.push_back(STATUS);
147-
fields.push_back(SNAPSHOT_ID);
148-
fields.push_back(SEQUENCE_NUMBER);
149-
fields.push_back(FILE_SEQUENCE_NUMBER);
150-
151-
// Add the data file schema
152-
auto data_file_type_field = SchemaField::MakeRequired(
153-
2, "data_file", std::make_shared<StructType>(DataFile::GetType(datafile_type)));
154-
fields.push_back(data_file_type_field);
155-
156-
return StructType(std::move(fields));
59+
std::shared_ptr<StructType> ManifestEntry::TypeFromDataFileType(
60+
std::shared_ptr<StructType> datafile_type) {
61+
return std::make_shared<StructType>(std::vector<SchemaField>{
62+
kStatus, kSnapshotId, kSequenceNumber, kFileSequenceNumber,
63+
SchemaField::MakeRequired(2, "data_file", std::move(datafile_type))});
15764
}
15865

15966
} // namespace iceberg

src/iceberg/manifest_entry.h

Lines changed: 98 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,18 @@
2222
#include <any>
2323
#include <cstdint>
2424
#include <map>
25+
#include <memory>
2526
#include <optional>
2627
#include <string>
27-
#include <unordered_map>
2828
#include <vector>
2929

30+
#include <iceberg/type_fwd.h>
31+
3032
#include "iceberg/file_format.h"
3133
#include "iceberg/iceberg_export.h"
3234
#include "iceberg/result.h"
33-
#include "iceberg/type_fwd.h"
35+
#include "iceberg/schema_field.h"
36+
#include "iceberg/type.h"
3437

3538
namespace iceberg {
3639

@@ -92,7 +95,7 @@ struct ICEBERG_EXPORT DataFile {
9295
/// Partition data tuple, schema based on the partition spec output using partition
9396
/// field ids for the struct field ids
9497
/// TODO(zhjwpku): use StructLike to represent partition data tuple
95-
std::map<std::string, std::any> partition;
98+
std::any partition;
9699
/// Field id: 103
97100
/// Number of records in this file, or the cardinality of a deletion vector
98101
int64_t record_count = 0;
@@ -105,44 +108,36 @@ struct ICEBERG_EXPORT DataFile {
105108
/// Map from column id to the total size on disk of all regions that store the column.
106109
/// Does not include bytes necessary to read other columns, like footers. Leave null for
107110
/// row-oriented formats (Avro)
108-
std::unordered_map<int32_t, int64_t> column_sizes;
111+
std::map<int32_t, int64_t> column_sizes;
109112
/// Field id: 109
110113
/// Key field id: 119
111114
/// Value field id: 120
112115
/// Map from column id to number of values in the column (including null and NaN values)
113-
std::unordered_map<int32_t, int64_t> value_counts;
116+
std::map<int32_t, int64_t> value_counts;
114117
/// Field id: 110
115118
/// Key field id: 121
116119
/// Value field id: 122
117120
/// Map from column id to number of null values in the column
118-
std::unordered_map<int32_t, int64_t> null_value_counts;
121+
std::map<int32_t, int64_t> null_value_counts;
119122
/// Field id: 137
120123
/// Key field id: 138
121124
/// Value field id: 139
122125
/// Map from column id to number of NaN values in the column
123-
std::unordered_map<int32_t, int64_t> nan_value_counts;
126+
std::map<int32_t, int64_t> nan_value_counts;
124127
/// Field id: 125
125128
/// Key field id: 126
126129
/// Value field id: 127
127130
/// Map from column id to lower bound in the column serialized as binary.
128131
/// Each value must be less than or equal to all non-null, non-NaN values in the column
129132
/// for the file.
130-
///
131-
/// Reference:
132-
/// - [Binary single-value
133-
/// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
134-
std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds;
133+
std::map<int32_t, std::vector<uint8_t>> lower_bounds;
135134
/// Field id: 128
136135
/// Key field id: 129
137136
/// Value field id: 130
138137
/// Map from column id to upper bound in the column serialized as binary.
139-
/// Each value must be greater than or equal to all non-null, non-Nan values in the
138+
/// Each value must be greater than or equal to all non-null, non-NaN values in the
140139
/// column for the file.
141-
///
142-
/// Reference:
143-
/// - [Binary single-value
144-
/// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
145-
std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds;
140+
std::map<int32_t, std::vector<uint8_t>> upper_bounds;
146141
/// Field id: 131
147142
/// Implementation-specific key metadata for encryption
148143
std::optional<std::vector<uint8_t>> key_metadata;
@@ -197,27 +192,80 @@ struct ICEBERG_EXPORT DataFile {
197192
/// present
198193
std::optional<int64_t> content_size_in_bytes;
199194

200-
static const SchemaField CONTENT;
201-
static const SchemaField FILE_PATH;
202-
static const SchemaField FILE_FORMAT;
203-
static const SchemaField RECORD_COUNT;
204-
static const SchemaField FILE_SIZE;
205-
static const SchemaField COLUMN_SIZES;
206-
static const SchemaField VALUE_COUNTS;
207-
static const SchemaField NULL_VALUE_COUNTS;
208-
static const SchemaField NAN_VALUE_COUNTS;
209-
static const SchemaField LOWER_BOUNDS;
210-
static const SchemaField UPPER_BOUNDS;
211-
static const SchemaField KEY_METADATA;
212-
static const SchemaField SPLIT_OFFSETS;
213-
static const SchemaField EQUALITY_IDS;
214-
static const SchemaField SORT_ORDER_ID;
215-
static const SchemaField FIRST_ROW_ID;
216-
static const SchemaField REFERENCED_DATA_FILE;
217-
static const SchemaField CONTENT_OFFSET;
218-
static const SchemaField CONTENT_SIZE;
195+
inline static const SchemaField kContent =
196+
SchemaField::MakeRequired(134, "content", std::make_shared<IntType>());
197+
inline static const SchemaField kFilePath =
198+
SchemaField::MakeRequired(100, "file_path", std::make_shared<StringType>());
199+
inline static const SchemaField kFileFormat =
200+
SchemaField::MakeRequired(101, "file_format", std::make_shared<IntType>());
201+
inline static const SchemaField kRecordCount =
202+
SchemaField::MakeRequired(103, "record_count", std::make_shared<LongType>());
203+
inline static const SchemaField kFileSize =
204+
SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared<LongType>());
205+
inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
206+
108, "column_sizes",
207+
std::make_shared<MapType>(
208+
SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
209+
std::make_shared<IntType>()),
210+
SchemaField::MakeRequired(118, std::string(MapType::kValueName),
211+
std::make_shared<LongType>())));
212+
inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
213+
109, "value_counts",
214+
std::make_shared<MapType>(
215+
SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
216+
std::make_shared<IntType>()),
217+
SchemaField::MakeRequired(120, std::string(MapType::kValueName),
218+
std::make_shared<LongType>())));
219+
inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
220+
110, "null_value_counts",
221+
std::make_shared<MapType>(
222+
SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
223+
std::make_shared<IntType>()),
224+
SchemaField::MakeRequired(122, std::string(MapType::kValueName),
225+
std::make_shared<LongType>())));
226+
inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
227+
137, "nan_value_counts",
228+
std::make_shared<MapType>(
229+
SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
230+
std::make_shared<IntType>()),
231+
SchemaField::MakeRequired(139, std::string(MapType::kValueName),
232+
std::make_shared<LongType>())));
233+
inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
234+
125, "lower_bounds",
235+
std::make_shared<MapType>(
236+
SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
237+
std::make_shared<IntType>()),
238+
SchemaField::MakeRequired(127, std::string(MapType::kValueName),
239+
std::make_shared<BinaryType>())));
240+
inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
241+
128, "upper_bounds",
242+
std::make_shared<MapType>(
243+
SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
244+
std::make_shared<IntType>()),
245+
SchemaField::MakeRequired(130, std::string(MapType::kValueName),
246+
std::make_shared<BinaryType>())));
247+
inline static const SchemaField kKeyMetadata =
248+
SchemaField::MakeOptional(131, "key_metadata", std::make_shared<BinaryType>());
249+
inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
250+
132, "split_offsets",
251+
std::make_shared<ListType>(SchemaField::MakeRequired(
252+
133, std::string(ListType::kElementName), std::make_shared<LongType>())));
253+
inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
254+
135, "equality_ids",
255+
std::make_shared<ListType>(SchemaField::MakeRequired(
256+
136, std::string(ListType::kElementName), std::make_shared<IntType>())));
257+
inline static const SchemaField kSortOrderId =
258+
SchemaField::MakeOptional(140, "sort_order_id", std::make_shared<IntType>());
259+
inline static const SchemaField kFirstRowId =
260+
SchemaField::MakeOptional(142, "first_row_id", std::make_shared<LongType>());
261+
inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
262+
143, "referenced_data_file", std::make_shared<StringType>());
263+
inline static const SchemaField kContentOffset =
264+
SchemaField::MakeOptional(144, "content_offset", std::make_shared<LongType>());
265+
inline static const SchemaField kContentSize = SchemaField::MakeOptional(
266+
145, "content_size_in_bytes", std::make_shared<LongType>());
219267

220-
static StructType GetType(StructType partition_type);
268+
static std::shared_ptr<StructType> Type(std::shared_ptr<StructType> partition_type);
221269
};
222270

223271
/// \brief A manifest is an immutable Avro file that lists data files or delete files,
@@ -244,13 +292,19 @@ struct ICEBERG_EXPORT ManifestEntry {
244292
/// File path, partition tuple, metrics, ...
245293
DataFile data_file;
246294

247-
static const SchemaField STATUS;
248-
static const SchemaField SNAPSHOT_ID;
249-
static const SchemaField SEQUENCE_NUMBER;
250-
static const SchemaField FILE_SEQUENCE_NUMBER;
295+
inline static const SchemaField kStatus =
296+
SchemaField::MakeRequired(0, "status", std::make_shared<IntType>());
297+
inline static const SchemaField kSnapshotId =
298+
SchemaField::MakeOptional(1, "snapshot_id", std::make_shared<LongType>());
299+
inline static const SchemaField kSequenceNumber =
300+
SchemaField::MakeOptional(3, "sequence_number", std::make_shared<LongType>());
301+
inline static const SchemaField kFileSequenceNumber =
302+
SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared<LongType>());
251303

252-
static StructType GetSchema(StructType partition_type);
253-
static StructType GetSchemaFromDataFileType(StructType datafile_type);
304+
static std::shared_ptr<StructType> TypeFromPartitionType(
305+
std::shared_ptr<StructType> partition_type);
306+
static std::shared_ptr<StructType> TypeFromDataFileType(
307+
std::shared_ptr<StructType> datafile_type);
254308
};
255309

256310
} // namespace iceberg

0 commit comments

Comments
 (0)