Skip to content

Commit 51de85a

Browse files
gtretteneroGiorgio Trettenerokevinjqliu
authored andcommitted
add PARTITION_SUMMARY_PROP (apache#2202)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Adding the snapshot property to keep parity with [Java](https://github.com/apache/iceberg/commits/main/core/src/main/java/org/apache/iceberg/SnapshotSummary.java) # Are these changes tested? Yes I modified two of the existing tests to add this property and added an additional two for cases where the table is unchanged or unpartitioned. # Are there any user-facing changes? Yes, this will add a property to the Snapshot Summaries when partitions are changed. <!-- In the case of user-facing changes, please add the changelog label. --> --------- Co-authored-by: Giorgio Trettenero <[email protected]> Co-authored-by: Kevin Liu <[email protected]>
1 parent ad07b6c commit 51de85a

File tree

2 files changed

+25
-0
lines changed

2 files changed

+25
-0
lines changed

pyiceberg/table/snapshots.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
TOTAL_FILE_SIZE = "total-files-size"
5959
CHANGED_PARTITION_COUNT_PROP = "changed-partition-count"
6060
CHANGED_PARTITION_PREFIX = "partitions."
61+
PARTITION_SUMMARY_PROP = "partition-summaries-included"
6162
OPERATION = "operation"
6263

6364
INITIAL_SEQUENCE_NUMBER = 0
@@ -306,6 +307,8 @@ def build(self) -> Dict[str, str]:
306307
changed_partitions_size = len(self.partition_metrics)
307308
set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP)
308309
if changed_partitions_size <= self.max_changed_partitions_for_summaries:
310+
if changed_partitions_size > 0:
311+
properties[PARTITION_SUMMARY_PROP] = "true"
309312
for partition_path, update_metrics_partition in self.partition_metrics.items():
310313
if (summary := self._partition_summary(update_metrics_partition)) and len(summary) != 0:
311314
properties[CHANGED_PARTITION_PREFIX + partition_path] = summary

tests/table/test_snapshots.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ def test_snapshot_summary_collector_with_partition() -> None:
224224
"added-records": "100",
225225
"deleted-records": "300",
226226
"changed-partition-count": "2",
227+
"partition-summaries-included": "true",
227228
"partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
228229
"partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
229230
}
@@ -259,11 +260,32 @@ def test_snapshot_summary_collector_with_partition_limit_in_constructor() -> Non
259260
"added-records": "100",
260261
"deleted-records": "300",
261262
"changed-partition-count": "2",
263+
"partition-summaries-included": "true",
262264
"partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
263265
"partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
264266
}
265267

266268

269+
@pytest.mark.integration
270+
def test_partition_summaries_included_not_set_when_no_change() -> None:
271+
ssc = SnapshotSummaryCollector()
272+
# No files added, so no partition_metrics
273+
ssc.set_partition_summary_limit(10)
274+
result = ssc.build()
275+
assert "partition-summaries-included" not in result
276+
assert result == {} # Should be empty dict
277+
278+
279+
@pytest.mark.integration
280+
def test_partition_summaries_included_not_set_when_unpartitioned_files(table_schema_simple: Schema) -> None:
281+
ssc = SnapshotSummaryCollector()
282+
data_file = DataFile.from_args(content=DataFileContent.DATA, record_count=100, file_size_in_bytes=1234, partition=Record())
283+
ssc.add_file(data_file, schema=table_schema_simple)
284+
ssc.set_partition_summary_limit(10)
285+
result = ssc.build()
286+
assert "partition-summaries-included" not in result
287+
288+
267289
def test_merge_snapshot_summaries_empty() -> None:
268290
assert update_snapshot_summaries(Summary(Operation.APPEND)) == Summary(
269291
operation=Operation.APPEND,

0 commit comments

Comments
 (0)