|
15 | 15 | # specific language governing permissions and limitations
|
16 | 16 | # under the License.
|
17 | 17 | # pylint:disable=redefined-outer-name
|
| 18 | +import math |
18 | 19 | import os
|
19 | 20 | import time
|
20 | 21 | import uuid
|
21 | 22 | from datetime import date, datetime
|
22 | 23 | from pathlib import Path
|
23 |
| -from typing import Any, Dict, List |
| 24 | +from typing import Any, Dict, List, Optional |
24 | 25 | from urllib.parse import urlparse
|
25 | 26 |
|
26 | 27 | import pyarrow as pa
|
@@ -135,15 +136,19 @@ def arrow_table_with_only_nulls(pa_schema: pa.Schema) -> pa.Table:
|
135 | 136 | return pa.Table.from_pylist([{}, {}], schema=pa_schema)
|
136 | 137 |
|
137 | 138 |
|
138 |
def _create_table(
    session_catalog: Catalog, identifier: str, properties: Properties, data: Optional[List[pa.Table]] = None
) -> Table:
    """Drop (if present) and recreate the integration-test table, optionally appending data.

    Args:
        session_catalog: Catalog used to drop and recreate the table.
        identifier: Fully qualified table identifier.
        properties: Table properties applied at creation time.
        data: Optional list of Arrow tables to append after creation.

    Returns:
        The freshly created table.
    """
    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        # Nothing to drop on the first run — start from a clean slate either way.
        pass

    tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties=properties)

    # `data or []` covers both the None default and an explicitly empty list.
    for batch in data or []:
        tbl.append(batch)

    return tbl
|
149 | 154 |
|
@@ -667,3 +672,69 @@ def test_table_properties_raise_for_none_value(
|
667 | 672 | session_catalog, identifier, {"format-version": format_version, **property_with_none}, [arrow_table_with_null]
|
668 | 673 | )
|
669 | 674 | assert "None type is not a supported value in properties: property_name" in str(exc_info.value)
|
| 675 | + |
| 676 | + |
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_inspect_snapshots(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """Validate `tbl.inspect.snapshots()` and cross-check it against Spark's snapshots metadata table."""
    identifier = "default.table_metadata_snapshots"
    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

    tbl.overwrite(arrow_table_with_null)
    # should produce a DELETE entry
    tbl.overwrite(arrow_table_with_null)
    # Since we don't rewrite, this should produce a new manifest with an ADDED entry
    tbl.append(arrow_table_with_null)

    df = tbl.inspect.snapshots()

    # Column layout of the snapshots metadata table.
    assert df.column_names == [
        'committed_at',
        'snapshot_id',
        'parent_id',
        'operation',
        'manifest_list',
        'summary',
    ]

    for ts in df['committed_at']:
        assert isinstance(ts.as_py(), datetime)

    for sid in df['snapshot_id']:
        assert isinstance(sid.as_py(), int)

    # Root snapshot has no parent; every later snapshot's parent is the preceding snapshot id.
    assert df['parent_id'][0].as_py() is None
    assert df['parent_id'][1:] == df['snapshot_id'][:2]

    assert [op.as_py() for op in df['operation']] == ['append', 'overwrite', 'append']

    for location in df['manifest_list']:
        assert location.as_py().startswith("s3://")

    # Summary of the first (append) snapshot; counters are string-valued in the summary map.
    assert df['summary'][0].as_py() == [
        ('added-files-size', '5459'),
        ('added-data-files', '1'),
        ('added-records', '3'),
        ('total-data-files', '1'),
        ('total-delete-files', '0'),
        ('total-records', '3'),
        ('total-files-size', '5459'),
        ('total-position-deletes', '0'),
        ('total-equality-deletes', '0'),
    ]

    # Compare every cell against Spark's view of the same metadata table.
    spark_df = spark.table(f"{identifier}.snapshots").toPandas()
    arrow_df = df.to_pandas()
    for column in df.column_names:
        for left, right in zip(spark_df[column].to_list(), arrow_df[column].to_list()):
            if column == 'summary':
                # Arrow returns a list of tuples, instead of a dict
                right = dict(right)

            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
                # NaN != NaN in Python
                continue

            assert left == right, f"Difference in column {column}: {left} != {right}"
0 commit comments