Commit 254a701

Add Files metadata table (#614)
1 parent adf8163 commit 254a701


3 files changed: +404 -0 lines changed


mkdocs/docs/api.md

Lines changed: 131 additions & 0 deletions
@@ -700,6 +700,137 @@ parent_id: [[null,4358109269873137077,null,4358109269873137077]]
is_current_ancestor: [[true,false,true,true]]
```

### Files

Inspect the data files in the current snapshot of the table:

```python
table.inspect.files()
```

```
pyarrow.Table
content: int8 not null
file_path: string not null
file_format: dictionary<values=string, indices=int32, ordered=0> not null
spec_id: int32 not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
null_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
nan_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
lower_bounds: map<int32, binary>
  child 0, entries: struct<key: int32 not null, value: binary> not null
      child 0, key: int32 not null
      child 1, value: binary
upper_bounds: map<int32, binary>
  child 0, entries: struct<key: int32 not null, value: binary> not null
      child 0, key: int32 not null
      child 1, value: binary
key_metadata: binary
split_offsets: list<item: int64>
  child 0, item: int64
equality_ids: list<item: int32>
  child 0, item: int32
sort_order_id: int32
readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
  child 0, city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string> not null
      child 0, column_size: int64
      child 1, value_count: int64
      child 2, null_value_count: int64
      child 3, nan_value_count: int64
      child 4, lower_bound: large_string
      child 5, upper_bound: large_string
  child 1, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
      child 0, column_size: int64
      child 1, value_count: int64
      child 2, null_value_count: int64
      child 3, nan_value_count: int64
      child 4, lower_bound: double
      child 5, upper_bound: double
  child 2, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
      child 0, column_size: int64
      child 1, value_count: int64
      child 2, null_value_count: int64
      child 3, nan_value_count: int64
      child 4, lower_bound: double
      child 5, upper_bound: double
----
content: [[0,0]]
file_path: [["s3://warehouse/default/table_metadata_files/data/00000-0-9ea7d222-6457-467f-bad5-6fb125c9aa5f.parquet","s3://warehouse/default/table_metadata_files/data/00000-0-afa8893c-de71-4710-97c9-6b01590d0c44.parquet"]]
file_format: [["PARQUET","PARQUET"]]
spec_id: [[0,0]]
record_count: [[3,3]]
file_size_in_bytes: [[5459,5459]]
column_sizes: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109]]]
value_counts: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[3,3,3,3,3,...,3,3,3,3,3],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[3,3,3,3,3,...,3,3,3,3,3]]]
null_value_counts: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[1,1,1,1,1,...,1,1,1,1,1],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[1,1,1,1,1,...,1,1,1,1,1]]]
nan_value_counts: [[keys:[]values:[],keys:[]values:[]]]
lower_bounds: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000]]]
upper_bounds: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000]]]
key_metadata: [[0100,0100]]
split_offsets: [[[],[]]]
equality_ids: [[[],[]]]
sort_order_id: [[[],[]]]
readable_metrics: [
  -- is_valid: all not null
  -- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string>
    -- is_valid: all not null
    -- child 0 type: int64
[140]
    -- child 1 type: int64
[4]
    -- child 2 type: int64
[0]
    -- child 3 type: int64
[null]
    -- child 4 type: large_string
["Amsterdam"]
    -- child 5 type: large_string
["San Francisco"]
  -- child 1 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
    -- is_valid: all not null
    -- child 0 type: int64
[135]
    -- child 1 type: int64
[4]
    -- child 2 type: int64
[0]
    -- child 3 type: int64
[null]
    -- child 4 type: double
[37.773972]
    -- child 5 type: double
[53.11254]
  -- child 2 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
    -- is_valid: all not null
    -- child 0 type: int64
[135]
    -- child 1 type: int64
[4]
    -- child 2 type: int64
[0]
    -- child 3 type: int64
[null]
    -- child 4 type: double
[-122.431297]
    -- child 5 type: double
[6.0989]]
```
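
`files` also accepts an optional `snapshot_id`, so the data files of a historical snapshot can be inspected as well. A minimal sketch (the id below is a placeholder borrowed from the history example above; substitute one of your table's snapshot ids):

```python
# Inspect the data files of a specific (possibly historical) snapshot.
# The snapshot id is a placeholder taken from the history example above.
table.inspect.files(snapshot_id=4358109269873137077)
```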
## Add Files

Expert Iceberg users may choose to commit existing Parquet files to the Iceberg table as data files, without rewriting them.
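
For instance, via `Table.add_files` (a minimal sketch; the paths are placeholders, and the files must match the table's schema and not already be referenced by the table):

```python
# Register existing Parquet files as data files, without rewriting them.
# The paths below are placeholders for files that match the table's schema.
table.add_files(file_paths=[
    "s3://warehouse/default/existing-0.parquet",
    "s3://warehouse/default/existing-1.parquet",
])
```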

pyiceberg/table/__init__.py

Lines changed: 103 additions & 0 deletions
@@ -3907,6 +3907,109 @@ def history(self) -> "pa.Table":

        return pa.Table.from_pylist(history, schema=history_schema)

    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
        """Return a pyarrow table of the data files in the given snapshot, or the current snapshot if not specified."""
        import pyarrow as pa

        from pyiceberg.io.pyarrow import schema_to_pyarrow

        schema = self.tbl.metadata.schema()
        readable_metrics_struct = []

        def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
            pa_bound_type = schema_to_pyarrow(bound_type)
            return pa.struct([
                pa.field("column_size", pa.int64(), nullable=True),
                pa.field("value_count", pa.int64(), nullable=True),
                pa.field("null_value_count", pa.int64(), nullable=True),
                pa.field("nan_value_count", pa.int64(), nullable=True),
                pa.field("lower_bound", pa_bound_type, nullable=True),
                pa.field("upper_bound", pa_bound_type, nullable=True),
            ])

        # One readable_metrics child struct per column, typed with the column's own bound type.
        for field in self.tbl.metadata.schema().fields:
            readable_metrics_struct.append(
                pa.field(schema.find_column_name(field.field_id), _readable_metrics_struct(field.field_type), nullable=False)
            )

        files_schema = pa.schema([
            pa.field("content", pa.int8(), nullable=False),
            pa.field("file_path", pa.string(), nullable=False),
            pa.field("file_format", pa.dictionary(pa.int32(), pa.string()), nullable=False),
            pa.field("spec_id", pa.int32(), nullable=False),
            pa.field("record_count", pa.int64(), nullable=False),
            pa.field("file_size_in_bytes", pa.int64(), nullable=False),
            pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True),
            pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True),
            pa.field("key_metadata", pa.binary(), nullable=True),
            pa.field("split_offsets", pa.list_(pa.int64()), nullable=True),
            pa.field("equality_ids", pa.list_(pa.int32()), nullable=True),
            pa.field("sort_order_id", pa.int32(), nullable=True),
            pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True),
        ])

        files: list[dict[str, Any]] = []

        if not snapshot_id and not self.tbl.metadata.current_snapshot():
            # No snapshot requested and the table has no current snapshot: return an empty table.
            return pa.Table.from_pylist(
                files,
                schema=files_schema,
            )
        snapshot = self._get_snapshot(snapshot_id)

        io = self.tbl.io
        for manifest_list in snapshot.manifests(io):
            for manifest_entry in manifest_list.fetch_manifest_entry(io):
                data_file = manifest_entry.data_file
                column_sizes = data_file.column_sizes or {}
                value_counts = data_file.value_counts or {}
                null_value_counts = data_file.null_value_counts or {}
                nan_value_counts = data_file.nan_value_counts or {}
                lower_bounds = data_file.lower_bounds or {}
                upper_bounds = data_file.upper_bounds or {}
                readable_metrics = {
                    schema.find_column_name(field.field_id): {
                        "column_size": column_sizes.get(field.field_id),
                        "value_count": value_counts.get(field.field_id),
                        "null_value_count": null_value_counts.get(field.field_id),
                        "nan_value_count": nan_value_counts.get(field.field_id),
                        # Decode the raw binary bounds into the field's own type.
                        "lower_bound": from_bytes(field.field_type, lower_bound)
                        if (lower_bound := lower_bounds.get(field.field_id))
                        else None,
                        "upper_bound": from_bytes(field.field_type, upper_bound)
                        if (upper_bound := upper_bounds.get(field.field_id))
                        else None,
                    }
                    for field in self.tbl.metadata.schema().fields
                }
                files.append({
                    "content": data_file.content,
                    "file_path": data_file.file_path,
                    "file_format": data_file.file_format,
                    "spec_id": data_file.spec_id,
                    "record_count": data_file.record_count,
                    "file_size_in_bytes": data_file.file_size_in_bytes,
                    # Use the None-guarded dicts from above so files without
                    # metrics don't raise a TypeError from dict(None).
                    "column_sizes": dict(column_sizes),
                    "value_counts": dict(value_counts),
                    "null_value_counts": dict(null_value_counts),
                    "nan_value_counts": dict(nan_value_counts),
                    "lower_bounds": dict(lower_bounds),
                    "upper_bounds": dict(upper_bounds),
                    "key_metadata": data_file.key_metadata,
                    "split_offsets": data_file.split_offsets,
                    "equality_ids": data_file.equality_ids,
                    "sort_order_id": data_file.sort_order_id,
                    "readable_metrics": readable_metrics,
                })

        return pa.Table.from_pylist(
            files,
            schema=files_schema,
        )


@dataclass(frozen=True)
class TablePartition:
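
To exercise the new method end to end, a quick sanity check could look like the following sketch (the catalog name and table identifier are assumptions for illustration):

```python
# Sketch: load a table and aggregate over the new files metadata table.
# The "default" catalog and "default.cities" table are placeholders.
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
table = catalog.load_table("default.cities")

files = table.inspect.files()
total_records = sum(files["record_count"].to_pylist())
print(f"{files.num_rows} data files, {total_records} records")
```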
