2222#include < any>
2323#include < cstdint>
2424#include < map>
25+ #include < memory>
2526#include < optional>
2627#include < string>
27- #include < unordered_map>
2828#include < vector>
2929
30+ #include < iceberg/type_fwd.h>
31+
3032#include " iceberg/file_format.h"
3133#include " iceberg/iceberg_export.h"
3234#include " iceberg/result.h"
33- #include " iceberg/type_fwd.h"
35+ #include " iceberg/schema_field.h"
36+ #include " iceberg/type.h"
3437
3538namespace iceberg {
3639
@@ -92,7 +95,7 @@ struct ICEBERG_EXPORT DataFile {
9295 // / Partition data tuple, schema based on the partition spec output using partition
9396 // / field ids for the struct field ids
9497 // / TODO(zhjwpku): use StructLike to represent partition data tuple
95- std::map<std::string, std:: any> partition;
98+ std::any partition;
9699 // / Field id: 103
97100 // / Number of records in this file, or the cardinality of a deletion vector
98101 int64_t record_count = 0 ;
@@ -105,44 +108,36 @@ struct ICEBERG_EXPORT DataFile {
105108 // / Map from column id to the total size on disk of all regions that store the column.
106109 // / Does not include bytes necessary to read other columns, like footers. Leave null for
107110 // / row-oriented formats (Avro)
108- std::unordered_map <int32_t , int64_t > column_sizes;
111+ std::map <int32_t , int64_t > column_sizes;
109112 // / Field id: 109
110113 // / Key field id: 119
111114 // / Value field id: 120
112115 // / Map from column id to number of values in the column (including null and NaN values)
113- std::unordered_map <int32_t , int64_t > value_counts;
116+ std::map <int32_t , int64_t > value_counts;
114117 // / Field id: 110
115118 // / Key field id: 121
116119 // / Value field id: 122
117120 // / Map from column id to number of null values in the column
118- std::unordered_map <int32_t , int64_t > null_value_counts;
121+ std::map <int32_t , int64_t > null_value_counts;
119122 // / Field id: 137
120123 // / Key field id: 138
121124 // / Value field id: 139
122125 // / Map from column id to number of NaN values in the column
123- std::unordered_map <int32_t , int64_t > nan_value_counts;
126+ std::map <int32_t , int64_t > nan_value_counts;
124127 // / Field id: 125
125128 // / Key field id: 126
126129 // / Value field id: 127
127130 // / Map from column id to lower bound in the column serialized as binary.
128131 // / Each value must be less than or equal to all non-null, non-NaN values in the column
129132 // / for the file.
130- // /
131- // / Reference:
132- // / - [Binary single-value
133- // / serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
134- std::unordered_map<int32_t , std::vector<uint8_t >> lower_bounds;
133+ std::map<int32_t , std::vector<uint8_t >> lower_bounds;
135134 // / Field id: 128
136135 // / Key field id: 129
137136 // / Value field id: 130
138137 // / Map from column id to upper bound in the column serialized as binary.
139- // / Each value must be greater than or equal to all non-null, non-Nan values in the
138+ // / Each value must be greater than or equal to all non-null, non-NaN values in the
140139 // / column for the file.
141- // /
142- // / Reference:
143- // / - [Binary single-value
144- // / serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
145- std::unordered_map<int32_t , std::vector<uint8_t >> upper_bounds;
140+ std::map<int32_t , std::vector<uint8_t >> upper_bounds;
146141 // / Field id: 131
147142 // / Implementation-specific key metadata for encryption
148143 std::optional<std::vector<uint8_t >> key_metadata;
@@ -197,27 +192,80 @@ struct ICEBERG_EXPORT DataFile {
197192 // / present
198193 std::optional<int64_t > content_size_in_bytes;
199194
200- static const SchemaField CONTENT;
201- static const SchemaField FILE_PATH;
202- static const SchemaField FILE_FORMAT;
203- static const SchemaField RECORD_COUNT;
204- static const SchemaField FILE_SIZE;
205- static const SchemaField COLUMN_SIZES;
206- static const SchemaField VALUE_COUNTS;
207- static const SchemaField NULL_VALUE_COUNTS;
208- static const SchemaField NAN_VALUE_COUNTS;
209- static const SchemaField LOWER_BOUNDS;
210- static const SchemaField UPPER_BOUNDS;
211- static const SchemaField KEY_METADATA;
212- static const SchemaField SPLIT_OFFSETS;
213- static const SchemaField EQUALITY_IDS;
214- static const SchemaField SORT_ORDER_ID;
215- static const SchemaField FIRST_ROW_ID;
216- static const SchemaField REFERENCED_DATA_FILE;
217- static const SchemaField CONTENT_OFFSET;
218- static const SchemaField CONTENT_SIZE;
195+ inline static const SchemaField kContent =
196+ SchemaField::MakeRequired (134 , " content" , std::make_shared<IntType>());
197+ inline static const SchemaField kFilePath =
198+ SchemaField::MakeRequired (100 , " file_path" , std::make_shared<StringType>());
199+ inline static const SchemaField kFileFormat =
200+ SchemaField::MakeRequired (101 , " file_format" , std::make_shared<IntType>());
201+ inline static const SchemaField kRecordCount =
202+ SchemaField::MakeRequired (103 , " record_count" , std::make_shared<LongType>());
203+ inline static const SchemaField kFileSize =
204+ SchemaField::MakeRequired (104 , " file_size_in_bytes" , std::make_shared<LongType>());
205+ inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
206+ 108 , " column_sizes" ,
207+ std::make_shared<MapType>(
208+ SchemaField::MakeRequired (117 , std::string(MapType::kKeyName ),
209+ std::make_shared<IntType>()),
210+ SchemaField::MakeRequired(118 , std::string(MapType::kValueName ),
211+ std::make_shared<LongType>())));
212+ inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
213+ 109 , " value_counts" ,
214+ std::make_shared<MapType>(
215+ SchemaField::MakeRequired (119 , std::string(MapType::kKeyName ),
216+ std::make_shared<IntType>()),
217+ SchemaField::MakeRequired(120 , std::string(MapType::kValueName ),
218+ std::make_shared<LongType>())));
219+ inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
220+ 110 , " null_value_counts" ,
221+ std::make_shared<MapType>(
222+ SchemaField::MakeRequired (121 , std::string(MapType::kKeyName ),
223+ std::make_shared<IntType>()),
224+ SchemaField::MakeRequired(122 , std::string(MapType::kValueName ),
225+ std::make_shared<LongType>())));
226+ inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
227+ 137 , " nan_value_counts" ,
228+ std::make_shared<MapType>(
229+ SchemaField::MakeRequired (138 , std::string(MapType::kKeyName ),
230+ std::make_shared<IntType>()),
231+ SchemaField::MakeRequired(139 , std::string(MapType::kValueName ),
232+ std::make_shared<LongType>())));
233+ inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
234+ 125 , " lower_bounds" ,
235+ std::make_shared<MapType>(
236+ SchemaField::MakeRequired (126 , std::string(MapType::kKeyName ),
237+ std::make_shared<IntType>()),
238+ SchemaField::MakeRequired(127 , std::string(MapType::kValueName ),
239+ std::make_shared<BinaryType>())));
240+ inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
241+ 128 , " upper_bounds" ,
242+ std::make_shared<MapType>(
243+ SchemaField::MakeRequired (129 , std::string(MapType::kKeyName ),
244+ std::make_shared<IntType>()),
245+ SchemaField::MakeRequired(130 , std::string(MapType::kValueName ),
246+ std::make_shared<BinaryType>())));
247+ inline static const SchemaField kKeyMetadata =
248+ SchemaField::MakeOptional (131 , " key_metadata" , std::make_shared<BinaryType>());
249+ inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
250+ 132 , " split_offsets" ,
251+ std::make_shared<ListType>(SchemaField::MakeRequired(
252+ 133 , std::string(ListType::kElementName ), std::make_shared<LongType>())));
253+ inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
254+ 135 , " equality_ids" ,
255+ std::make_shared<ListType>(SchemaField::MakeRequired(
256+ 136 , std::string(ListType::kElementName ), std::make_shared<IntType>())));
257+ inline static const SchemaField kSortOrderId =
258+ SchemaField::MakeOptional (140 , " sort_order_id" , std::make_shared<IntType>());
259+ inline static const SchemaField kFirstRowId =
260+ SchemaField::MakeOptional (142 , " first_row_id" , std::make_shared<LongType>());
261+ inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
262+ 143 , " referenced_data_file" , std::make_shared<StringType>());
263+ inline static const SchemaField kContentOffset =
264+ SchemaField::MakeOptional (144 , " content_offset" , std::make_shared<LongType>());
265+ inline static const SchemaField kContentSize = SchemaField::MakeOptional(
266+ 145 , " content_size_in_bytes" , std::make_shared<LongType>());
219267
220- static StructType GetType ( StructType partition_type);
268+ static std::shared_ptr< StructType> Type (std::shared_ptr< StructType> partition_type);
221269};
222270
223271// / \brief A manifest is an immutable Avro file that lists data files or delete files,
@@ -244,13 +292,19 @@ struct ICEBERG_EXPORT ManifestEntry {
244292 // / File path, partition tuple, metrics, ...
245293 DataFile data_file;
246294
247- static const SchemaField STATUS;
248- static const SchemaField SNAPSHOT_ID;
249- static const SchemaField SEQUENCE_NUMBER;
250- static const SchemaField FILE_SEQUENCE_NUMBER;
295+ inline static const SchemaField kStatus =
296+ SchemaField::MakeRequired (0 , " status" , std::make_shared<IntType>());
297+ inline static const SchemaField kSnapshotId =
298+ SchemaField::MakeOptional (1 , " snapshot_id" , std::make_shared<LongType>());
299+ inline static const SchemaField kSequenceNumber =
300+ SchemaField::MakeOptional (3 , " sequence_number" , std::make_shared<LongType>());
301+ inline static const SchemaField kFileSequenceNumber =
302+ SchemaField::MakeOptional (4 , " file_sequence_number" , std::make_shared<LongType>());
251303
252- static StructType GetSchema (StructType partition_type);
253- static StructType GetSchemaFromDataFileType (StructType datafile_type);
304+ static std::shared_ptr<StructType> TypeFromPartitionType (
305+ std::shared_ptr<StructType> partition_type);
306+ static std::shared_ptr<StructType> TypeFromDataFileType (
307+ std::shared_ptr<StructType> datafile_type);
254308};
255309
256310} // namespace iceberg
0 commit comments