apache · HonahX · Jun 26, 2024 · Apr 28, 2024 · Apr 28, 2024 · Apr 28, 2024
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
@@ -656,6 +656,29 @@ partition_summaries: [[    -- is_valid: all not null
 ["test"]]]
 ```
 
+### Metadata Log Entries
+
+To show the table's metadata log entries (one row per metadata file, ending with the current one):
+
+```python
+table.inspect.metadata_log_entries()
+```
+
+```
+pyarrow.Table
+timestamp: timestamp[ms] not null
+file: string not null
+latest_snapshot_id: int64
+latest_schema_id: int32
+latest_sequence_number: int64
+----
+timestamp: [[2024-04-28 17:03:00.214,2024-04-28 17:03:00.352,2024-04-28 17:03:00.445,2024-04-28 17:03:00.498]]
+file: [["s3://warehouse/default/table_metadata_log_entries/metadata/00000-0b3b643b-0f3a-4787-83ad-601ba57b7319.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00001-f74e4b2c-0f89-4f55-822d-23d099fd7d54.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00002-97e31507-e4d9-4438-aff1-3c0c5304d271.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00003-6c8b7033-6ad8-4fe4-b64d-d70381aeaddc.metadata.json"]]
+latest_snapshot_id: [[null,3958871664825505738,1289234307021405706,7640277914614648349]]
+latest_schema_id: [[null,0,0,0]]
+latest_sequence_number: [[null,0,0,0]]
+```
+
 ## Add Files
 
 Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -3827,6 +3827,40 @@ def _partition_summaries_to_rows(
             schema=manifest_schema,
         )
 
+    def metadata_log_entries(self) -> "pa.Table":  # one row per metadata file, current file last
+        import pyarrow as pa
+
+        from pyiceberg.table.snapshots import MetadataLogEntry
+
+        table_schema = pa.schema([
+            pa.field("timestamp", pa.timestamp(unit="ms"), nullable=False),
+            pa.field("file", pa.string(), nullable=False),
+            pa.field("latest_snapshot_id", pa.int64(), nullable=True),  # latest_* are null when no snapshot is resolved
+            pa.field("latest_schema_id", pa.int32(), nullable=True),
+            pa.field("latest_sequence_number", pa.int64(), nullable=True),
+        ])
+
+        def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any]:
+            latest_snapshot = self.tbl.snapshot_as_of_timestamp(metadata_entry.timestamp_ms)  # may be falsy, see below
+            return {
+                "timestamp": metadata_entry.timestamp_ms,
+                "file": metadata_entry.metadata_file,
+                "latest_snapshot_id": latest_snapshot.snapshot_id if latest_snapshot else None,
+                "latest_schema_id": latest_snapshot.schema_id if latest_snapshot else None,
+                "latest_sequence_number": latest_snapshot.sequence_number if latest_snapshot else None,
+            }
+
+        # Append the current metadata file (location + last-updated time) as the final entry; imitates `addPreviousFile` from Java
+        # https://github.com/apache/iceberg/blob/8248663a2a1ffddd2664ea37b45882455466f71c/core/src/main/java/org/apache/iceberg/TableMetadata.java#L1450-L1451
+        metadata_log_entries = self.tbl.metadata.metadata_log + [
+            MetadataLogEntry(metadata_file=self.tbl.metadata_location, timestamp_ms=self.tbl.metadata.last_updated_ms)
+        ]
+
+        return pa.Table.from_pylist(
+            [metadata_log_entry_to_row(entry) for entry in metadata_log_entries],
+            schema=table_schema,
+        )
+
 
 @dataclass(frozen=True)
 class TablePartition:

diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py
@@ -58,6 +58,8 @@
 CHANGED_PARTITION_PREFIX = "partitions."
 OPERATION = "operation"
 
+INITIAL_SEQUENCE_NUMBER = 0
+
 
 class Operation(Enum):
     """Describes the operation.
@@ -231,7 +233,7 @@ def __eq__(self, other: Any) -> bool:
 class Snapshot(IcebergBaseModel):
     snapshot_id: int = Field(alias="snapshot-id")
     parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None)
-    sequence_number: Optional[int] = Field(alias="sequence-number", default=None)
+    sequence_number: Optional[int] = Field(alias="sequence-number", default=INITIAL_SEQUENCE_NUMBER)
     timestamp_ms: int = Field(alias="timestamp-ms", default_factory=lambda: int(time.time() * 1000))
     manifest_list: Optional[str] = Field(
         alias="manifest-list", description="Location of the snapshot's manifest list file", default=None

diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py
@@ -528,3 +528,43 @@ def test_inspect_manifests(spark: SparkSession, session_catalog: Catalog, format
     for column in df.column_names:
         for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
             assert left == right, f"Difference in column {column}: {left} != {right}"
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_metadata_log_entries(
+    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
+) -> None:
+    from pandas.testing import assert_frame_equal
+
+    identifier = "default.table_metadata_log_entries"
+    try:
+        session_catalog.drop_table(identifier=identifier)
+    except NoSuchTableError:
+        pass  # table did not exist yet; nothing to clean up
+
+    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
+
+    # Write some data in three separate appends
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
+
+    df = tbl.inspect.metadata_log_entries()
+    spark_df = spark.sql(f"SELECT * FROM {identifier}.metadata_log_entries")  # Spark's result is the reference
+    lhs = df.to_pandas()
+    rhs = spark_df.toPandas()
+
+    # The timestamp in the last row of the `metadata_log_entries` table is based on when the table was read,
+    # so the last-row timestamps of the PyIceberg and Spark dataframes are expected to differ
+    left_before_last, left_last = lhs[:-1], lhs[-1:]
+    right_before_last, right_last = rhs[:-1], rhs[-1:]
+
+    # compare all rows except the last one (dtype differences between the two frames are ignored)
+    assert_frame_equal(left_before_last, right_before_last, check_dtype=False)
+    # compare the last row column by column, skipping the timestamp
+    for column in df.column_names:
+        for left, right in zip(left_last[column], right_last[column]):
+            if column == "timestamp":
+                continue
+            assert left == right, f"Difference in column {column}: {left} != {right}"
diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py
@@ -168,7 +168,7 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None:
 def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None:
     table_metadata = TableMetadataV1(**example_table_metadata_v1)
     table_metadata_json = table_metadata.model_dump_json()
-    expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}"""
+    expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"sequence-number":0,"timestamp-ms":1602638573822}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}"""
     assert table_metadata_json == expected
 
 

diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py
@@ -83,7 +83,7 @@ def test_serialize_snapshot_without_sequence_number() -> None:
         schema_id=3,
     )
     actual = snapshot.model_dump_json()
-    expected = """{"snapshot-id":25,"parent-snapshot-id":19,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}"""
+    expected = """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":0,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}"""
     assert actual == expected