Skip to content

Commit ea57cbb

Browse files
authored
Add v3 structs (apache#2065)
Closes apache#1982. Rationale for this change: adds v3 structs for data files, manifest entries, and manifest lists, which are necessary for related v3 work. Are these changes tested? No, these are just schemas. Are there any user-facing changes? No.
1 parent 47613d3 commit ea57cbb

File tree

1 file changed

+148
-0
lines changed

1 file changed

+148
-0
lines changed

pyiceberg/manifest.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,129 @@ def __repr__(self) -> str:
289289
doc="ID representing sort order for this file",
290290
),
291291
),
292+
3: StructType(
293+
# Field 134 records what kind of content the file holds (defaults to DATA).
# The file's format is described separately by the `file_format` field.
NestedField(
    field_id=134,
    name="content",
    field_type=IntegerType(),
    required=True,
    doc="Type of content stored by the data file: data, position deletes, or equality deletes",
    initial_default=DataFileContent.DATA,
),
301+
NestedField(field_id=100, name="file_path", field_type=StringType(), required=True, doc="Location URI with FS scheme"),
302+
NestedField(
303+
field_id=101,
304+
name="file_format",
305+
field_type=StringType(),
306+
required=True,
307+
doc="File format name: avro, orc, or parquet",
308+
),
309+
NestedField(
310+
field_id=102,
311+
name="partition",
312+
field_type=StructType(),
313+
required=True,
314+
doc="Partition data tuple, schema based on the partition spec",
315+
),
316+
NestedField(field_id=103, name="record_count", field_type=LongType(), required=True, doc="Number of records in the file"),
317+
NestedField(
318+
field_id=104, name="file_size_in_bytes", field_type=LongType(), required=True, doc="Total file size in bytes"
319+
),
320+
NestedField(
321+
field_id=108,
322+
name="column_sizes",
323+
field_type=MapType(key_id=117, key_type=IntegerType(), value_id=118, value_type=LongType()),
324+
required=False,
325+
doc="Map of column id to total size on disk",
326+
),
327+
NestedField(
328+
field_id=109,
329+
name="value_counts",
330+
field_type=MapType(key_id=119, key_type=IntegerType(), value_id=120, value_type=LongType()),
331+
required=False,
332+
doc="Map of column id to total count, including null and NaN",
333+
),
334+
NestedField(
335+
field_id=110,
336+
name="null_value_counts",
337+
field_type=MapType(key_id=121, key_type=IntegerType(), value_id=122, value_type=LongType()),
338+
required=False,
339+
doc="Map of column id to null value count",
340+
),
341+
NestedField(
342+
field_id=137,
343+
name="nan_value_counts",
344+
field_type=MapType(key_id=138, key_type=IntegerType(), value_id=139, value_type=LongType()),
345+
required=False,
346+
doc="Map of column id to number of NaN values in the column",
347+
),
348+
NestedField(
349+
field_id=125,
350+
name="lower_bounds",
351+
field_type=MapType(key_id=126, key_type=IntegerType(), value_id=127, value_type=BinaryType()),
352+
required=False,
353+
doc="Map of column id to lower bound",
354+
),
355+
NestedField(
356+
field_id=128,
357+
name="upper_bounds",
358+
field_type=MapType(key_id=129, key_type=IntegerType(), value_id=130, value_type=BinaryType()),
359+
required=False,
360+
doc="Map of column id to upper bound",
361+
),
362+
NestedField(
363+
field_id=131, name="key_metadata", field_type=BinaryType(), required=False, doc="Encryption key metadata blob"
364+
),
365+
NestedField(
366+
field_id=132,
367+
name="split_offsets",
368+
field_type=ListType(element_id=133, element_type=LongType(), element_required=True),
369+
required=False,
370+
doc="Splittable offsets",
371+
),
372+
NestedField(
373+
field_id=135,
374+
name="equality_ids",
375+
field_type=ListType(element_id=136, element_type=LongType(), element_required=True),
376+
required=False,
377+
doc="Field ids used to determine row equality in equality delete files.",
378+
),
379+
NestedField(
380+
field_id=140,
381+
name="sort_order_id",
382+
field_type=IntegerType(),
383+
required=False,
384+
doc="ID representing sort order for this file",
385+
),
386+
NestedField(
387+
field_id=142,
388+
name="first_row_id",
389+
field_type=LongType(),
390+
required=False,
391+
doc="The _row_id for the first row in the data file.",
392+
),
393+
NestedField(
394+
field_id=143,
395+
name="referenced_data_file",
396+
field_type=StringType(),
397+
required=False,
398+
doc="Fully qualified location (URI with FS scheme) of a data file that all deletes reference",
399+
),
400+
NestedField(
401+
field_id=144,
402+
name="content_offset",
403+
field_type=LongType(),
404+
required=False,
405+
doc="The offset in the file where the content starts.",
406+
),
407+
NestedField(
408+
field_id=145,
409+
name="content_size_in_bytes",
410+
field_type=LongType(),
411+
required=False,
412+
doc="The length of a referenced content stored in the file; required if content_offset is present",
413+
),
414+
),
292415
}
293416

294417

@@ -434,6 +557,13 @@ def __eq__(self, other: Any) -> bool:
434557
NestedField(4, "file_sequence_number", LongType(), required=False),
435558
NestedField(2, "data_file", DATA_FILE_TYPE[2], required=True),
436559
),
560+
3: Schema(
561+
NestedField(0, "status", IntegerType(), required=True),
562+
NestedField(1, "snapshot_id", LongType(), required=False),
563+
NestedField(3, "sequence_number", LongType(), required=False),
564+
NestedField(4, "file_sequence_number", LongType(), required=False),
565+
NestedField(2, "data_file", DATA_FILE_TYPE[3], required=True),
566+
),
437567
}
438568

439569
# Struct form of every manifest-entry schema, keyed by format version.
MANIFEST_ENTRY_SCHEMAS_STRUCT = {
    version: entry_schema.as_struct() for version, entry_schema in MANIFEST_ENTRY_SCHEMAS.items()
}
@@ -604,6 +734,24 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition
604734
NestedField(507, "partitions", ListType(508, PARTITION_FIELD_SUMMARY_TYPE, element_required=True), required=False),
605735
NestedField(519, "key_metadata", BinaryType(), required=False),
606736
),
737+
3: Schema(
738+
NestedField(500, "manifest_path", StringType(), required=True, doc="Location URI with FS scheme"),
739+
NestedField(501, "manifest_length", LongType(), required=True),
740+
NestedField(502, "partition_spec_id", IntegerType(), required=True),
741+
NestedField(517, "content", IntegerType(), required=True, initial_default=ManifestContent.DATA),
742+
NestedField(515, "sequence_number", LongType(), required=True, initial_default=0),
743+
NestedField(516, "min_sequence_number", LongType(), required=True, initial_default=0),
744+
NestedField(503, "added_snapshot_id", LongType(), required=True),
745+
NestedField(504, "added_files_count", IntegerType(), required=True),
746+
NestedField(505, "existing_files_count", IntegerType(), required=True),
747+
NestedField(506, "deleted_files_count", IntegerType(), required=True),
748+
NestedField(512, "added_rows_count", LongType(), required=True),
749+
NestedField(513, "existing_rows_count", LongType(), required=True),
750+
NestedField(514, "deleted_rows_count", LongType(), required=True),
751+
NestedField(507, "partitions", ListType(508, PARTITION_FIELD_SUMMARY_TYPE, element_required=True), required=False),
752+
NestedField(519, "key_metadata", BinaryType(), required=False),
753+
NestedField(520, "first_row_id", LongType(), required=False),
754+
),
607755
}
608756

609757
# Struct form of every manifest-list file schema, keyed by format version.
MANIFEST_LIST_FILE_STRUCTS = {
    version: list_schema.as_struct() for version, list_schema in MANIFEST_LIST_FILE_SCHEMAS.items()
}

0 commit comments

Comments
 (0)