@@ -138,7 +138,7 @@ struct ICEBERG_EXPORT DataFile {
138138 std::map<int32_t , std::vector<uint8_t >> upper_bounds;
139139 /// Field id: 131
140140 /// Implementation-specific key metadata for encryption
/// NOTE(review): the '+' line drops the std::optional wrapper, so "absent" and
/// "empty" are no longer distinct at the type level. Presumably an empty vector
/// now stands for "no key metadata" -- confirm with the code that reads and
/// writes this member.
141- std::optional<std:: vector<uint8_t > > key_metadata;
141+ std::vector<uint8_t > key_metadata;
142142 /// Field id: 132
143143 /// Element Field id: 133
144144 /// Split offsets for the data file. For example, all row group offsets in a Parquet
@@ -190,78 +190,95 @@ struct ICEBERG_EXPORT DataFile {
190190 /// present
/// (The comment above begins before this hunk; only its last word is visible.)
/// Optional 64-bit value; it shares its name with the kContentSize schema field
/// (id 145) defined further down in this struct.
191191 std::optional<int64_t > content_size_in_bytes;
192192
/// kContent (field id 134, IntType) and kFilePath (field id 100, StringType):
/// required schema fields built with SchemaField::MakeRequired.
/// The '-' lines pass three arguments; the '+' lines pass a fourth string that
/// reads as a description of the field (presumably the field's doc text --
/// confirm against SchemaField::MakeRequired).
/// NOTE(review): the space after each opening quote looks like an artifact of
/// this rendering rather than part of the real literals -- confirm against the
/// header itself.
193- inline static const SchemaField kContent =
194- SchemaField::MakeRequired (134 , " content" , std::make_shared<IntType>());
195- inline static const SchemaField kFilePath =
196- SchemaField::MakeRequired (100 , " file_path" , std::make_shared<StringType>());
193+ inline static const SchemaField kContent = SchemaField::MakeRequired(
194+ 134 , " content" , std::make_shared<IntType>(),
195+ " Contents of the file: 0=data, 1=position deletes, 2=equality deletes" );
196+ inline static const SchemaField kFilePath = SchemaField::MakeRequired(
197+ 100 , " file_path" , std::make_shared<StringType>(), " Location URI with FS scheme" );
/// kFileFormat (field id 101, IntType) and kRecordCount (field id 103, LongType):
/// required schema fields built with SchemaField::MakeRequired.
/// The '+' lines add a fourth string argument describing each field.
/// NOTE(review): kFileFormat is declared with IntType, yet its description talks
/// about a format *name* (avro, orc, or parquet). The Iceberg table spec lists
/// file_format as a string -- confirm whether IntType is intended here.
197198 inline static const SchemaField kFileFormat =
198- SchemaField::MakeRequired (101 , " file_format" , std::make_shared<IntType>());
199- inline static const SchemaField kRecordCount =
200- SchemaField::MakeRequired (103 , " record_count" , std::make_shared<LongType>());
199+ SchemaField::MakeRequired (101 , " file_format" , std::make_shared<IntType>(),
200+ "File format name: avro, orc, or parquet");
201+ inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
202+ 103 , " record_count" , std::make_shared<LongType>(), " Number of records in the file" );
/// kFileSize: required LongType schema field, id 104.
/// The '+' lines add a fourth string argument describing the field.
201203 inline static const SchemaField kFileSize =
202- SchemaField::MakeRequired (104 , " file_size_in_bytes" , std::make_shared<LongType>());
204+ SchemaField::MakeRequired (104 , " file_size_in_bytes" , std::make_shared<LongType>(),
205+ "Total file size in bytes");
/// kColumnSizes: optional MapType schema field, id 108, whose key is a required
/// IntType field (id 117) and whose value is a required LongType field (id 118).
/// The '+' lines close the MapType argument one parenthesis earlier and append
/// a description string as the final MakeOptional argument.
203206 inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
204207 108 , " column_sizes" ,
205208 std::make_shared<MapType>(
206209 SchemaField::MakeRequired (117 , std::string(MapType::kKeyName ),
207210 std::make_shared<IntType>()),
208211 SchemaField::MakeRequired(118 , std::string(MapType::kValueName ),
209- std::make_shared<LongType>())));
212+ std::make_shared<LongType>())),
213+ "Map of column id to total size on disk");
/// kValueCounts: optional MapType schema field, id 109, whose key is a required
/// IntType field (id 119) and whose value is a required LongType field (id 120).
/// The '+' lines append a description string as the final MakeOptional argument.
210214 inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
211215 109 , " value_counts" ,
212216 std::make_shared<MapType>(
213217 SchemaField::MakeRequired (119 , std::string(MapType::kKeyName ),
214218 std::make_shared<IntType>()),
215219 SchemaField::MakeRequired(120 , std::string(MapType::kValueName ),
216- std::make_shared<LongType>())));
220+ std::make_shared<LongType>())),
221+ "Map of column id to total count, including null and NaN");
/// kNullValueCounts: optional MapType schema field, id 110, whose key is a
/// required IntType field (id 121) and whose value is a required LongType field
/// (id 122).
/// The '+' lines append a description string as the final MakeOptional argument.
217222 inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
218223 110 , " null_value_counts" ,
219224 std::make_shared<MapType>(
220225 SchemaField::MakeRequired (121 , std::string(MapType::kKeyName ),
221226 std::make_shared<IntType>()),
222227 SchemaField::MakeRequired(122 , std::string(MapType::kValueName ),
223- std::make_shared<LongType>())));
228+ std::make_shared<LongType>())),
229+ "Map of column id to null value count");
/// kNanValueCounts: optional MapType schema field, id 137, whose key is a
/// required IntType field (id 138) and whose value is a required LongType field
/// (id 139).
/// The '+' lines append a description string as the final MakeOptional argument.
224230 inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
225231 137 , " nan_value_counts" ,
226232 std::make_shared<MapType>(
227233 SchemaField::MakeRequired (138 , std::string(MapType::kKeyName ),
228234 std::make_shared<IntType>()),
229235 SchemaField::MakeRequired(139 , std::string(MapType::kValueName ),
230- std::make_shared<LongType>())));
236+ std::make_shared<LongType>())),
237+ "Map of column id to number of NaN values in the column");
/// kLowerBounds: optional MapType schema field, id 125, whose key is a required
/// IntType field (id 126) and whose value is a required BinaryType field (id 127).
/// The '+' lines append a description string as the final MakeOptional argument.
231238 inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
232239 125 , " lower_bounds" ,
233240 std::make_shared<MapType>(
234241 SchemaField::MakeRequired (126 , std::string(MapType::kKeyName ),
235242 std::make_shared<IntType>()),
236243 SchemaField::MakeRequired(127 , std::string(MapType::kValueName ),
237- std::make_shared<BinaryType>())));
244+ std::make_shared<BinaryType>())),
245+ "Map of column id to lower bound");
/// kUpperBounds: optional MapType schema field, id 128, whose key is a required
/// IntType field (id 129) and whose value is a required BinaryType field (id 130).
/// The '+' lines append a description string as the final MakeOptional argument.
238246 inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
239247 128 , " upper_bounds" ,
240248 std::make_shared<MapType>(
241249 SchemaField::MakeRequired (129 , std::string(MapType::kKeyName ),
242250 std::make_shared<IntType>()),
243251 SchemaField::MakeRequired(130 , std::string(MapType::kValueName ),
244- std::make_shared<BinaryType>())));
252+ std::make_shared<BinaryType>())),
253+ "Map of column id to upper bound");
/// kKeyMetadata: optional BinaryType schema field, id 131.
/// The '+' lines add a fourth string argument describing the field.
/// NOTE(review): this schema field stays optional, while the key_metadata struct
/// member changed earlier in this diff from std::optional<std::vector<uint8_t>>
/// to a plain std::vector<uint8_t> -- confirm how an absent value is represented.
245254 inline static const SchemaField kKeyMetadata =
246- SchemaField::MakeOptional (131 , " key_metadata" , std::make_shared<BinaryType>());
255+ SchemaField::MakeOptional (131 , " key_metadata" , std::make_shared<BinaryType>(),
256+ "Encryption key metadata blob");
/// kSplitOffsets: optional ListType schema field, id 132, whose element is a
/// required LongType field (id 133).
/// The '+' lines append a description string as the final MakeOptional argument.
247257 inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
248258 132 , " split_offsets" ,
249259 std::make_shared<ListType>(SchemaField::MakeRequired(
250- 133 , std::string(ListType::kElementName ), std::make_shared<LongType>())));
260+ 133 , std::string(ListType::kElementName ), std::make_shared<LongType>())),
261+ " Splittable offsets" );
/// kEqualityIds: optional ListType schema field, id 135, whose element is a
/// required IntType field (id 136).
/// kSortOrderId: optional IntType schema field, id 140.
/// The '+' lines append a description string as the final MakeOptional argument
/// of each definition.
251263 inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
252264 135 , " equality_ids" ,
253265 std::make_shared<ListType>(SchemaField::MakeRequired(
254- 136 , std::string(ListType::kElementName ), std::make_shared<IntType>())));
255- inline static const SchemaField kSortOrderId =
256- SchemaField::MakeOptional (140 , " sort_order_id" , std::make_shared<IntType>());
265+ 136 , std::string(ListType::kElementName ), std::make_shared<IntType>())),
266+ " Equality comparison field IDs" );
267+ inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
268+ 140 , " sort_order_id" , std::make_shared<IntType>(), " Sort order ID" );
/// kFirstRowId: optional LongType schema field, id 142.
/// The '+' lines add a fourth string argument describing the field.
257269 inline static const SchemaField kFirstRowId =
258- SchemaField::MakeOptional (142 , " first_row_id" , std::make_shared<LongType>());
270+ SchemaField::MakeOptional (142 , " first_row_id" , std::make_shared<LongType>(),
271+ "Starting row ID to assign to new rows");
/// kReferencedDataFile: optional StringType schema field, id 143.
/// The '+' lines add a description built from two adjacent string literals,
/// which the compiler concatenates into a single string.
259272 inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
260- 143 , " referenced_data_file" , std::make_shared<StringType>());
273+ 143 , " referenced_data_file" , std::make_shared<StringType>(),
274+ " Fully qualified location (URI with FS scheme) of a data file that all deletes "
275+ " reference" );
/// kContentOffset: optional LongType schema field, id 144.
/// The '+' lines add a fourth string argument describing the field.
261276 inline static const SchemaField kContentOffset =
262- SchemaField::MakeOptional (144 , " content_offset" , std::make_shared<LongType>());
277+ SchemaField::MakeOptional (144 , " content_offset" , std::make_shared<LongType>(),
278+ "The offset in the file where the content starts");
/// kContentSize: optional LongType schema field, id 145.
/// The '+' lines add a fourth string argument describing the field.
263279 inline static const SchemaField kContentSize = SchemaField::MakeOptional(
264- 145 , " content_size_in_bytes" , std::make_shared<LongType>());
280+ 145 , " content_size_in_bytes" , std::make_shared<LongType>(),
281+ " The length of referenced content stored in the file" );
265282
/// Declaration only; the definition is not part of this diff.
/// Takes a shared pointer to the partition StructType and returns a shared
/// pointer to a StructType.
/// NOTE(review): presumably the result is the data-file struct schema assembled
/// from the SchemaField constants above plus the given partition type --
/// confirm against the definition.
266283 static std::shared_ptr<StructType> Type (std::shared_ptr<StructType> partition_type);
267284};
0 commit comments