
Commit 4cd67ac

Conflicts
1 parent 5b10f25 commit 4cd67ac

4 files changed: 135 additions & 135 deletions


tests/conftest.py

Lines changed: 14 additions & 14 deletions
@@ -2143,31 +2143,31 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table":
 
     return pa.Table.from_pydict(
         {
-            'bool': [False, None, True],
-            'string': ['a', None, 'z'],
+            "bool": [False, None, True],
+            "string": ["a", None, "z"],
             # Go over the 16 bytes to kick in truncation
-            'string_long': ['a' * 22, None, 'z' * 22],
-            'int': [1, None, 9],
-            'long': [1, None, 9],
-            'float': [0.0, None, 0.9],
-            'double': [0.0, None, 0.9],
+            "string_long": ["a" * 22, None, "z" * 22],
+            "int": [1, None, 9],
+            "long": [1, None, 9],
+            "float": [0.0, None, 0.9],
+            "double": [0.0, None, 0.9],
             # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
-            'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
-            'timestamptz': [
+            "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
+            "timestamptz": [
                 datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
                 None,
                 datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
             ],
-            'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
+            "date": [date(2023, 1, 1), None, date(2023, 3, 1)],
             # Not supported by Spark
             # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
             # Not natively supported by Arrow
             # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-            'binary': [b'\01', None, b'\22'],
-            'fixed': [
-                uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
+            "binary": [b"\01", None, b"\22"],
+            "fixed": [
+                uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
                 None,
-                uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
+                uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
             ],
         },
         schema=pa_schema,

tests/integration/test_deletes.py

Lines changed: 40 additions & 40 deletions
@@ -38,7 +38,7 @@ def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None:
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"
 
     run_spark_commands(
         spark,
@@ -66,14 +66,14 @@ def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog
     tbl.delete(EqualTo("number_partitioned", 10))
 
     # No overwrite operation
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'delete']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 11], 'number': [20, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "delete"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 11], "number": [20, 30]}
 
 
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"
 
     run_spark_commands(
         spark,
@@ -101,14 +101,14 @@ def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCat
     tbl.delete(EqualTo("number", 20))
 
     # We don't delete a whole partition, so there is only a overwrite
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'overwrite']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 10], 'number': [30, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "overwrite"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 10], "number": [30, 30]}
 
 
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"
 
     run_spark_commands(
         spark,
@@ -132,13 +132,13 @@ def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCa
     tbl = session_catalog.load_table(identifier)
     tbl.delete(EqualTo("number_partitioned", 22))  # Does not affect any data
 
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10, 10], 'number': [20, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10, 10], "number": [20, 30]}
 
 
 @pytest.mark.integration
 def test_partitioned_table_positional_deletes(spark: SparkSession, session_catalog: RestCatalog) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"
 
     run_spark_commands(
         spark,
@@ -180,13 +180,13 @@ def test_partitioned_table_positional_deletes(spark: SparkSession, session_catal
 
     # One positional delete has been added, but an OVERWRITE status is set
     # https://github.com/apache/iceberg/issues/10122
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'overwrite', 'overwrite']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10], 'number': [20]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "overwrite", "overwrite"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10], "number": [20]}
 
 
 @pytest.mark.integration
 def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession, session_catalog: RestCatalog) -> None:
-    identifier = 'default.table_partitioned_delete_sequence_number'
+    identifier = "default.table_partitioned_delete_sequence_number"
 
     # This test case is a bit more complex. Here we run a MoR delete on a file, we make sure that
     # the manifest gets rewritten (but not the data file with a MoR), and check if the delete is still there
@@ -234,40 +234,40 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
     assert len(snapshots) == 3
 
     # Snapshots produced by Spark
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ['append', 'overwrite']
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ["append", "overwrite"]
 
     # Will rewrite one parquet file
     assert snapshots[2].summary == Summary(
         Operation.OVERWRITE,
         **{
-            'added-files-size': '1145',
-            'added-data-files': '1',
-            'added-records': '2',
-            'changed-partition-count': '1',
-            'total-files-size': snapshots[2].summary['total-files-size'],
-            'total-delete-files': '0',
-            'total-data-files': '1',
-            'total-position-deletes': '0',
-            'total-records': '2',
-            'total-equality-deletes': '0',
-            'deleted-data-files': '2',
-            'removed-delete-files': '1',
-            'deleted-records': '5',
-            'removed-files-size': snapshots[2].summary['removed-files-size'],
-            'removed-position-deletes': '1',
+            "added-files-size": "1145",
+            "added-data-files": "1",
+            "added-records": "2",
+            "changed-partition-count": "1",
+            "total-files-size": snapshots[2].summary["total-files-size"],
+            "total-delete-files": "0",
+            "total-data-files": "1",
+            "total-position-deletes": "0",
+            "total-records": "2",
+            "total-equality-deletes": "0",
+            "deleted-data-files": "2",
+            "removed-delete-files": "1",
+            "deleted-records": "5",
+            "removed-files-size": snapshots[2].summary["removed-files-size"],
+            "removed-position-deletes": "1",
         },
     )
 
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [20, 20, 10], 'number': [200, 202, 100]}
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [20, 20, 10], "number": [200, 202, 100]}
 
 
 @pytest.mark.integration
 def test_delete_no_match(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
-            {'ints': 3},
+            {"ints": 1},
+            {"ints": 3},
         ],
         schema=arrow_schema,
     )
@@ -286,7 +286,7 @@ def test_delete_no_match(session_catalog: RestCatalog) -> None:
 
     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND]
 
-    tbl.delete('ints == 2')  # Only 1 and 3 in the file, but is between the lower and upper bound
+    tbl.delete("ints == 2")  # Only 1 and 3 in the file, but is between the lower and upper bound
 
     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND]
 
@@ -296,8 +296,8 @@ def test_delete_overwrite(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
-            {'ints': 2},
+            {"ints": 1},
+            {"ints": 2},
         ],
         schema=arrow_schema,
     )
@@ -318,28 +318,28 @@ def test_delete_overwrite(session_catalog: RestCatalog) -> None:
 
     arrow_tbl_overwrite = pa.Table.from_pylist(
         [
-            {'ints': 3},
-            {'ints': 4},
+            {"ints": 3},
+            {"ints": 4},
         ],
         schema=arrow_schema,
     )
-    tbl.overwrite(arrow_tbl_overwrite, 'ints == 2')  # Should rewrite one file
+    tbl.overwrite(arrow_tbl_overwrite, "ints == 2")  # Should rewrite one file
 
     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [
         Operation.APPEND,
         Operation.OVERWRITE,
         Operation.APPEND,
     ]
 
-    assert tbl.scan().to_arrow()['ints'].to_pylist() == [3, 4, 1]
+    assert tbl.scan().to_arrow()["ints"].to_pylist() == [3, 4, 1]
 
 
 @pytest.mark.integration
 def test_delete_truncate(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
+            {"ints": 1},
         ],
         schema=arrow_schema,
     )

tests/integration/test_inspect_table.py

Lines changed: 23 additions & 23 deletions
@@ -102,38 +102,38 @@ def test_inspect_snapshots(
     for snapshot_id in df["snapshot_id"]:
         assert isinstance(snapshot_id.as_py(), int)
 
-    assert df['parent_id'][0].as_py() is None
-    assert df['parent_id'][1:].to_pylist() == df['snapshot_id'][:-1].to_pylist()
+    assert df["parent_id"][0].as_py() is None
+    assert df["parent_id"][1:].to_pylist() == df["snapshot_id"][:-1].to_pylist()
 
-    assert [operation.as_py() for operation in df['operation']] == ['append', 'delete', 'append', 'append']
+    assert [operation.as_py() for operation in df["operation"]] == ["append", "delete", "append", "append"]
 
     for manifest_list in df["manifest_list"]:
         assert manifest_list.as_py().startswith("s3://")
 
     # Append
-    assert df['summary'][0].as_py() == [
-        ('added-files-size', '5459'),
-        ('added-data-files', '1'),
-        ('added-records', '3'),
-        ('total-data-files', '1'),
-        ('total-delete-files', '0'),
-        ('total-records', '3'),
-        ('total-files-size', '5459'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][0].as_py() == [
+        ("added-files-size", "5459"),
+        ("added-data-files", "1"),
+        ("added-records", "3"),
+        ("total-data-files", "1"),
+        ("total-delete-files", "0"),
+        ("total-records", "3"),
+        ("total-files-size", "5459"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]
 
     # Delete
-    assert df['summary'][1].as_py() == [
-        ('removed-files-size', '5459'),
-        ('deleted-data-files', '1'),
-        ('deleted-records', '3'),
-        ('total-data-files', '0'),
-        ('total-delete-files', '0'),
-        ('total-records', '0'),
-        ('total-files-size', '0'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][1].as_py() == [
+        ("removed-files-size", "5459"),
+        ("deleted-data-files", "1"),
+        ("deleted-records", "3"),
+        ("total-data-files", "0"),
+        ("total-delete-files", "0"),
+        ("total-records", "0"),
+        ("total-files-size", "0"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]
 
     lhs = spark.table(f"{identifier}.snapshots").toPandas()
