@@ -289,6 +289,129 @@ def __repr__(self) -> str:
289
289
doc = "ID representing sort order for this file" ,
290
290
),
291
291
),
292
+ 3 : StructType (
293
+ NestedField (
294
+ field_id = 134 ,
295
+ name = "content" ,
296
+ field_type = IntegerType (),
297
+ required = True ,
298
+ doc = "File format name: avro, orc, or parquet" ,
299
+ initial_default = DataFileContent .DATA ,
300
+ ),
301
+ NestedField (field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme" ),
302
+ NestedField (
303
+ field_id = 101 ,
304
+ name = "file_format" ,
305
+ field_type = StringType (),
306
+ required = True ,
307
+ doc = "File format name: avro, orc, or parquet" ,
308
+ ),
309
+ NestedField (
310
+ field_id = 102 ,
311
+ name = "partition" ,
312
+ field_type = StructType (),
313
+ required = True ,
314
+ doc = "Partition data tuple, schema based on the partition spec" ,
315
+ ),
316
+ NestedField (field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file" ),
317
+ NestedField (
318
+ field_id = 104 , name = "file_size_in_bytes" , field_type = LongType (), required = True , doc = "Total file size in bytes"
319
+ ),
320
+ NestedField (
321
+ field_id = 108 ,
322
+ name = "column_sizes" ,
323
+ field_type = MapType (key_id = 117 , key_type = IntegerType (), value_id = 118 , value_type = LongType ()),
324
+ required = False ,
325
+ doc = "Map of column id to total size on disk" ,
326
+ ),
327
+ NestedField (
328
+ field_id = 109 ,
329
+ name = "value_counts" ,
330
+ field_type = MapType (key_id = 119 , key_type = IntegerType (), value_id = 120 , value_type = LongType ()),
331
+ required = False ,
332
+ doc = "Map of column id to total count, including null and NaN" ,
333
+ ),
334
+ NestedField (
335
+ field_id = 110 ,
336
+ name = "null_value_counts" ,
337
+ field_type = MapType (key_id = 121 , key_type = IntegerType (), value_id = 122 , value_type = LongType ()),
338
+ required = False ,
339
+ doc = "Map of column id to null value count" ,
340
+ ),
341
+ NestedField (
342
+ field_id = 137 ,
343
+ name = "nan_value_counts" ,
344
+ field_type = MapType (key_id = 138 , key_type = IntegerType (), value_id = 139 , value_type = LongType ()),
345
+ required = False ,
346
+ doc = "Map of column id to number of NaN values in the column" ,
347
+ ),
348
+ NestedField (
349
+ field_id = 125 ,
350
+ name = "lower_bounds" ,
351
+ field_type = MapType (key_id = 126 , key_type = IntegerType (), value_id = 127 , value_type = BinaryType ()),
352
+ required = False ,
353
+ doc = "Map of column id to lower bound" ,
354
+ ),
355
+ NestedField (
356
+ field_id = 128 ,
357
+ name = "upper_bounds" ,
358
+ field_type = MapType (key_id = 129 , key_type = IntegerType (), value_id = 130 , value_type = BinaryType ()),
359
+ required = False ,
360
+ doc = "Map of column id to upper bound" ,
361
+ ),
362
+ NestedField (
363
+ field_id = 131 , name = "key_metadata" , field_type = BinaryType (), required = False , doc = "Encryption key metadata blob"
364
+ ),
365
+ NestedField (
366
+ field_id = 132 ,
367
+ name = "split_offsets" ,
368
+ field_type = ListType (element_id = 133 , element_type = LongType (), element_required = True ),
369
+ required = False ,
370
+ doc = "Splittable offsets" ,
371
+ ),
372
+ NestedField (
373
+ field_id = 135 ,
374
+ name = "equality_ids" ,
375
+ field_type = ListType (element_id = 136 , element_type = LongType (), element_required = True ),
376
+ required = False ,
377
+ doc = "Field ids used to determine row equality in equality delete files." ,
378
+ ),
379
+ NestedField (
380
+ field_id = 140 ,
381
+ name = "sort_order_id" ,
382
+ field_type = IntegerType (),
383
+ required = False ,
384
+ doc = "ID representing sort order for this file" ,
385
+ ),
386
+ NestedField (
387
+ field_id = 142 ,
388
+ name = "first_row_id" ,
389
+ field_type = LongType (),
390
+ required = False ,
391
+ doc = "The _row_id for the first row in the data file." ,
392
+ ),
393
+ NestedField (
394
+ field_id = 143 ,
395
+ name = "referenced_data_file" ,
396
+ field_type = StringType (),
397
+ required = False ,
398
+ doc = "Fully qualified location (URI with FS scheme) of a data file that all deletes reference" ,
399
+ ),
400
+ NestedField (
401
+ field_id = 144 ,
402
+ name = "content_offset" ,
403
+ field_type = LongType (),
404
+ required = False ,
405
+ doc = "The offset in the file where the content starts." ,
406
+ ),
407
+ NestedField (
408
+ field_id = 145 ,
409
+ name = "content_size_in_bytes" ,
410
+ field_type = LongType (),
411
+ required = False ,
412
+ doc = "The length of a referenced content stored in the file; required if content_offset is present" ,
413
+ ),
414
+ ),
292
415
}
293
416
294
417
@@ -434,6 +557,13 @@ def __eq__(self, other: Any) -> bool:
434
557
NestedField (4 , "file_sequence_number" , LongType (), required = False ),
435
558
NestedField (2 , "data_file" , DATA_FILE_TYPE [2 ], required = True ),
436
559
),
560
+ 3 : Schema (
561
+ NestedField (0 , "status" , IntegerType (), required = True ),
562
+ NestedField (1 , "snapshot_id" , LongType (), required = False ),
563
+ NestedField (3 , "sequence_number" , LongType (), required = False ),
564
+ NestedField (4 , "file_sequence_number" , LongType (), required = False ),
565
+ NestedField (2 , "data_file" , DATA_FILE_TYPE [3 ], required = True ),
566
+ ),
437
567
}
438
568
439
569
# Struct form of each manifest-entry schema, keyed by the same format version
# as MANIFEST_ENTRY_SCHEMAS (one `as_struct()` call per registered schema).
MANIFEST_ENTRY_SCHEMAS_STRUCT = {
    version: entry_schema.as_struct()
    for version, entry_schema in MANIFEST_ENTRY_SCHEMAS.items()
}
@@ -604,6 +734,24 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition
604
734
NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
605
735
NestedField (519 , "key_metadata" , BinaryType (), required = False ),
606
736
),
737
+ 3 : Schema (
738
+ NestedField (500 , "manifest_path" , StringType (), required = True , doc = "Location URI with FS scheme" ),
739
+ NestedField (501 , "manifest_length" , LongType (), required = True ),
740
+ NestedField (502 , "partition_spec_id" , IntegerType (), required = True ),
741
+ NestedField (517 , "content" , IntegerType (), required = True , initial_default = ManifestContent .DATA ),
742
+ NestedField (515 , "sequence_number" , LongType (), required = True , initial_default = 0 ),
743
+ NestedField (516 , "min_sequence_number" , LongType (), required = True , initial_default = 0 ),
744
+ NestedField (503 , "added_snapshot_id" , LongType (), required = True ),
745
+ NestedField (504 , "added_files_count" , IntegerType (), required = True ),
746
+ NestedField (505 , "existing_files_count" , IntegerType (), required = True ),
747
+ NestedField (506 , "deleted_files_count" , IntegerType (), required = True ),
748
+ NestedField (512 , "added_rows_count" , LongType (), required = True ),
749
+ NestedField (513 , "existing_rows_count" , LongType (), required = True ),
750
+ NestedField (514 , "deleted_rows_count" , LongType (), required = True ),
751
+ NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
752
+ NestedField (519 , "key_metadata" , BinaryType (), required = False ),
753
+ NestedField (520 , "first_row_id" , LongType (), required = False ),
754
+ ),
607
755
}
608
756
609
757
# Struct form of each manifest-list file schema, keyed by the same format
# version as MANIFEST_LIST_FILE_SCHEMAS (one `as_struct()` call per schema).
MANIFEST_LIST_FILE_STRUCTS = {
    version: list_schema.as_struct()
    for version, list_schema in MANIFEST_LIST_FILE_SCHEMAS.items()
}
0 commit comments