Commit a97c45a
WIP
1 parent 172f9c0

4 files changed: +66 -61 lines changed
pyiceberg/table/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -1956,8 +1956,7 @@ def union_by_name(self, new_schema: Union[Schema, "pa.Schema"]) -> UpdateSchema:
         visit_with_partner(
             Catalog._convert_schema_if_needed(new_schema),
             -1,
-            UnionByNameVisitor(update_schema=self, existing_schema=self._schema, case_sensitive=self._case_sensitive),
-            # type: ignore
+            UnionByNameVisitor(update_schema=self, existing_schema=self._schema, case_sensitive=self._case_sensitive),  # type: ignore
             PartnerIdByNameAccessor(partner_schema=self._schema, case_sensitive=self._case_sensitive),
         )
         return self
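
Note on this hunk (not part of the commit page): mypy ties a "# type: ignore" comment to the physical line it sits on, so a suppression comment left on its own line does not silence an error reported for the neighbouring call; folding it back onto the argument line restores the suppression, which is presumably why the comment is moved here. A minimal standalone sketch of that rule, using throwaway names:

from typing import List

values: List[int] = []

# Suppressed: the comment sits on the offending line itself.
values.append("same line")  # type: ignore

# type: ignore
# NOT suppressed: the standalone comment above is not attached to this call,
# so mypy still reports an arg-type error for the line below.
values.append("next line")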

tests/conftest.py

Lines changed: 31 additions & 31 deletions

@@ -2064,36 +2064,6 @@ def spark() -> "SparkSession":
     return spark


-TEST_DATA_WITH_NULL = {
-    'bool': [False, None, True],
-    'string': ['a', None, 'z'],
-    # Go over the 16 bytes to kick in truncation
-    'string_long': ['a' * 22, None, 'z' * 22],
-    'int': [1, None, 9],
-    'long': [1, None, 9],
-    'float': [0.0, None, 0.9],
-    'double': [0.0, None, 0.9],
-    # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
-    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
-    'timestamptz': [
-        datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
-        None,
-        datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
-    ],
-    'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
-    # Not supported by Spark
-    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
-    # Not natively supported by Arrow
-    # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-    'binary': [b'\01', None, b'\22'],
-    'fixed': [
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        None,
-        uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
-    ],
-}
-
-
 @pytest.fixture(scope="session")
 def pa_schema() -> "pa.Schema":
     import pyarrow as pa
@@ -2125,7 +2095,37 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table":
     """Pyarrow table with all kinds of columns."""
     import pyarrow as pa

-    return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)
+    return pa.Table.from_pydict(
+        {
+            'bool': [False, None, True],
+            'string': ['a', None, 'z'],
+            # Go over the 16 bytes to kick in truncation
+            'string_long': ['a' * 22, None, 'z' * 22],
+            'int': [1, None, 9],
+            'long': [1, None, 9],
+            'float': [0.0, None, 0.9],
+            'double': [0.0, None, 0.9],
+            # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
+            'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
+            'timestamptz': [
+                datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
+                None,
+                datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
+            ],
+            'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
+            # Not supported by Spark
+            # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
+            # Not natively supported by Arrow
+            # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
+            'binary': [b'\01', None, b'\22'],
+            'fixed': [
+                uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
+                None,
+                uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
+            ],
+        },
+        schema=pa_schema,
+    )


 @pytest.fixture(scope="session")
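
Note on this refactor (not part of the commit page): with the module-level TEST_DATA_WITH_NULL constant gone, the test modules below iterate the columns directly off the fixture's Arrow table via arrow_table_with_null.column_names instead of the dict's .keys(). A minimal standalone sketch of that pattern, with placeholder data rather than the real fixture schema:

import pyarrow as pa

# Placeholder table standing in for the arrow_table_with_null fixture.
table = pa.Table.from_pydict({'bool': [False, None, True], 'int': [1, None, 9]})

# column_names provides the same iteration the removed dict's .keys() did.
for col in table.column_names:
    nulls = table.column(col).null_count
    print(f"{col}: {nulls} null value(s)")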

tests/integration/test_writes/test_partitioned_writes.py

Lines changed: 11 additions & 7 deletions

@@ -32,7 +32,6 @@
     TruncateTransform,
     YearTransform,
 )
-from tests.conftest import TEST_DATA_WITH_NULL
 from utils import TABLE_SCHEMA, _create_table


@@ -64,7 +63,7 @@ def test_query_filter_null_partitioned(
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
     assert df.count() == 3, f"Expected 3 total rows for {identifier}"
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null"

@@ -75,7 +74,12 @@ def test_query_filter_null_partitioned(
 )
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_query_filter_without_data_partitioned(
-    session_catalog: Catalog, spark: SparkSession, arrow_table_without_data: pa.Table, part_col: str, format_version: int
+    session_catalog: Catalog,
+    spark: SparkSession,
+    arrow_table_without_data: pa.Table,
+    part_col: str,
+    arrow_table_with_null: pa.Table,
+    format_version: int,
 ) -> None:
     # Given
     identifier = f"default.arrow_table_v{format_version}_without_data_partitioned_on_col_{part_col}"
@@ -96,7 +100,7 @@ def test_query_filter_without_data_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
         assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"

@@ -128,7 +132,7 @@ def test_query_filter_only_nulls_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_only_nulls.column_names:
         assert df.where(f"{col} is null").count() == 2, f"Expected 2 row for {col}"
         assert df.where(f"{col} is not null").count() == 0, f"Expected 0 rows for {col}"

@@ -163,7 +167,7 @@ def test_query_filter_appended_null_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         df = spark.table(identifier)
         assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}"
@@ -207,7 +211,7 @@ def test_query_filter_v1_v2_append_null(

     # Then
     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"
-    for col in TEST_DATA_WITH_NULL.keys():  # type: ignore
+    for col in arrow_table_with_null.column_names:  # type: ignore
         df = spark.table(identifier)
         assert df.where(f"{col} is not null").count() == 4, f"Expected 4 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 2, f"Expected 2 null rows for {col}"
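
Note on the widened signature above (not part of the commit page): adding arrow_table_with_null to the parameter list asks pytest to inject that session fixture purely so the test can read its column names, even though the data written comes from arrow_table_without_data. A minimal standalone sketch of requesting a fixture only for its metadata, using placeholder names and data:

import pyarrow as pa
import pytest

@pytest.fixture(scope="session")
def arrow_table_with_null() -> pa.Table:
    # Placeholder fixture; the real one lives in tests/conftest.py.
    return pa.Table.from_pydict({'bool': [False, None, True], 'int': [1, None, 9]})

def test_uses_fixture_for_metadata_only(arrow_table_with_null: pa.Table) -> None:
    # The fixture is consulted only for its column names, not its rows.
    assert 'bool' in arrow_table_with_null.column_names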

tests/integration/test_writes/test_writes.py

Lines changed: 23 additions & 21 deletions

@@ -38,7 +38,6 @@
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.io.pyarrow import _dataframe_to_data_files
 from pyiceberg.table import TableProperties
-from tests.conftest import TEST_DATA_WITH_NULL
 from utils import _create_table


@@ -120,52 +119,55 @@ def test_query_count(spark: SparkSession, format_version: int) -> None:


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_null(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_without_data(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_without_data(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_without_data"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
-    assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
+        assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_only_nulls(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_only_nulls(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_with_only_nulls"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}"
-    assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}"
+        assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_appended_null(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_appended_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
-def test_query_filter_v1_v2_append_null(spark: SparkSession, col: str) -> None:
+def test_query_filter_v1_v2_append_null(
+    spark: SparkSession,
+    arrow_table_with_null: pa.Table,
+) -> None:
     identifier = "default.arrow_table_v1_v2_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"


 @pytest.mark.integration
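
Note on the test restructuring above (not part of the commit page): dropping the per-column @pytest.mark.parametrize("col", ...) means each test now runs once per format version and loops over every column internally, rather than generating one test case per column. A minimal sketch of the two styles, with a placeholder column list:

import pytest

COLUMNS = ['bool', 'int', 'string']  # placeholder list, not the fixture schema

# Before: pytest generates one test case per column.
@pytest.mark.parametrize("col", COLUMNS)
def test_per_column(col: str) -> None:
    assert col in COLUMNS

# After: a single test case iterates the columns itself.
def test_all_columns() -> None:
    for col in COLUMNS:
        assert col in COLUMNS

The trade-off, roughly: parametrization reports each column as its own test and keeps running past a failing column, while the in-test loop stops at the first failing column but no longer depends on a module-level constant for the column list.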
