@@ -185,10 +185,11 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
185
185
).collect ()
186
186
187
187
operations = [row .operation for row in rows ]
188
- assert operations == ['append' , 'append' , 'overwrite' ]
188
+ assert operations == ['append' , 'append' , 'delete' , 'overwrite' ]
189
189
190
190
summaries = [row .summary for row in rows ]
191
191
192
+ # Append
192
193
assert summaries [0 ] == {
193
194
'added-data-files' : '1' ,
194
195
'added-files-size' : '5459' ,
@@ -201,6 +202,7 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
201
202
'total-records' : '3' ,
202
203
}
203
204
205
+ # Append
204
206
assert summaries [1 ] == {
205
207
'added-data-files' : '1' ,
206
208
'added-files-size' : '5459' ,
@@ -213,13 +215,24 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
213
215
'total-records' : '6' ,
214
216
}
215
217
218
+ # Delete
216
219
assert summaries [2 ] == {
217
- 'added-data-files' : '1' ,
218
- 'added-files-size' : '5459' ,
219
- 'added-records' : '3' ,
220
220
'deleted-data-files' : '2' ,
221
221
'deleted-records' : '6' ,
222
222
'removed-files-size' : '10918' ,
223
+ 'total-data-files' : '0' ,
224
+ 'total-delete-files' : '0' ,
225
+ 'total-equality-deletes' : '0' ,
226
+ 'total-files-size' : '0' ,
227
+ 'total-position-deletes' : '0' ,
228
+ 'total-records' : '0' ,
229
+ }
230
+
231
+ # Overwrite
232
+ assert summaries [3 ] == {
233
+ 'added-data-files' : '1' ,
234
+ 'added-files-size' : '5459' ,
235
+ 'added-records' : '3' ,
223
236
'total-data-files' : '1' ,
224
237
'total-delete-files' : '0' ,
225
238
'total-equality-deletes' : '0' ,
@@ -247,9 +260,9 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
247
260
"""
248
261
).collect ()
249
262
250
- assert [row .added_data_files_count for row in rows ] == [1 , 1 , 0 , 1 , 1 ]
251
- assert [row .existing_data_files_count for row in rows ] == [0 , 0 , 0 , 0 , 0 ]
252
- assert [row .deleted_data_files_count for row in rows ] == [0 , 0 , 1 , 0 , 0 ]
263
+ assert [row .added_data_files_count for row in rows ] == [1 , 0 , 1 , 0 , 1 , 1 ]
264
+ assert [row .existing_data_files_count for row in rows ] == [0 , 0 , 0 , 0 , 0 , 0 ]
265
+ assert [row .deleted_data_files_count for row in rows ] == [0 , 1 , 0 , 1 , 0 , 0 ]
253
266
254
267
255
268
@pytest .mark .integration
@@ -476,7 +489,7 @@ def test_summaries_with_only_nulls(
476
489
).collect ()
477
490
478
491
operations = [row .operation for row in rows ]
479
- assert operations == ['append' , 'append' , 'overwrite' ]
492
+ assert operations == ['append' , 'append' , 'delete' , 'overwrite' ]
480
493
481
494
summaries = [row .summary for row in rows ]
482
495
@@ -502,14 +515,23 @@ def test_summaries_with_only_nulls(
502
515
}
503
516
504
517
assert summaries [2 ] == {
518
+ 'deleted-data-files' : '1' ,
519
+ 'deleted-records' : '2' ,
505
520
'removed-files-size' : '4239' ,
521
+ 'total-data-files' : '0' ,
522
+ 'total-delete-files' : '0' ,
506
523
'total-equality-deletes' : '0' ,
524
+ 'total-files-size' : '0' ,
507
525
'total-position-deletes' : '0' ,
508
- 'deleted-data-files' : '1' ,
526
+ 'total-records' : '0' ,
527
+ }
528
+
529
+ assert summaries [3 ] == {
530
+ 'total-data-files' : '0' ,
509
531
'total-delete-files' : '0' ,
532
+ 'total-equality-deletes' : '0' ,
510
533
'total-files-size' : '0' ,
511
- 'deleted-records' : '2' ,
512
- 'total-data-files' : '0' ,
534
+ 'total-position-deletes' : '0' ,
513
535
'total-records' : '0' ,
514
536
}
515
537
@@ -731,13 +753,14 @@ def test_inspect_snapshots(
731
753
assert isinstance (snapshot_id .as_py (), int )
732
754
733
755
assert df ['parent_id' ][0 ].as_py () is None
734
- assert df ['parent_id' ][1 :] == df ['snapshot_id' ][:2 ]
756
+ assert df ['parent_id' ][1 :]. to_pylist () == df ['snapshot_id' ][:- 1 ]. to_pylist ()
735
757
736
- assert [operation .as_py () for operation in df ['operation' ]] == ['append' , 'overwrite' , 'append' ]
758
+ assert [operation .as_py () for operation in df ['operation' ]] == ['append' , 'delete' , 'overwrite' , 'append' ]
737
759
738
760
for manifest_list in df ['manifest_list' ]:
739
761
assert manifest_list .as_py ().startswith ("s3://" )
740
762
763
+ # Append
741
764
assert df ['summary' ][0 ].as_py () == [
742
765
('added-files-size' , '5459' ),
743
766
('added-data-files' , '1' ),
@@ -750,6 +773,19 @@ def test_inspect_snapshots(
750
773
('total-equality-deletes' , '0' ),
751
774
]
752
775
776
+ # Delete
777
+ assert df ['summary' ][1 ].as_py () == [
778
+ ('removed-files-size' , '5459' ),
779
+ ('deleted-data-files' , '1' ),
780
+ ('deleted-records' , '3' ),
781
+ ('total-data-files' , '0' ),
782
+ ('total-delete-files' , '0' ),
783
+ ('total-records' , '0' ),
784
+ ('total-files-size' , '0' ),
785
+ ('total-position-deletes' , '0' ),
786
+ ('total-equality-deletes' , '0' ),
787
+ ]
788
+
753
789
lhs = spark .table (f"{ identifier } .snapshots" ).toPandas ()
754
790
rhs = df .to_pandas ()
755
791
for column in df .column_names :
0 commit comments