Commit a97c45a
WIP
1 parent 172f9c0

4 files changed: +66 -61 lines changed
pyiceberg/table/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -1956,8 +1956,7 @@ def union_by_name(self, new_schema: Union[Schema, "pa.Schema"]) -> UpdateSchema:
         visit_with_partner(
             Catalog._convert_schema_if_needed(new_schema),
             -1,
-            UnionByNameVisitor(update_schema=self, existing_schema=self._schema, case_sensitive=self._case_sensitive),
-            # type: ignore
+            UnionByNameVisitor(update_schema=self, existing_schema=self._schema, case_sensitive=self._case_sensitive),  # type: ignore
             PartnerIdByNameAccessor(partner_schema=self._schema, case_sensitive=self._case_sensitive),
         )
         return self
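
Note on this hunk (not part of the commit page): mypy ties a "# type: ignore" comment to the physical line it sits on, so a suppression comment left on its own line does not silence an error reported for the neighbouring call; folding it back onto the argument line restores the suppression, which is presumably why the comment is moved here. A minimal standalone sketch of that rule, using throwaway names:

from typing import List

values: List[int] = []

# Suppressed: the comment sits on the offending line itself.
values.append("same line")  # type: ignore

# type: ignore
# NOT suppressed: the standalone comment above is not attached to this call,
# so mypy still reports an arg-type error for the line below.
values.append("next line")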

tests/conftest.py

Lines changed: 31 additions & 31 deletions

@@ -2064,36 +2064,6 @@ def spark() -> "SparkSession":
     return spark


-TEST_DATA_WITH_NULL = {
-    'bool': [False, None, True],
-    'string': ['a', None, 'z'],
-    # Go over the 16 bytes to kick in truncation
-    'string_long': ['a' * 22, None, 'z' * 22],
-    'int': [1, None, 9],
-    'long': [1, None, 9],
-    'float': [0.0, None, 0.9],
-    'double': [0.0, None, 0.9],
-    # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
-    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
-    'timestamptz': [
-        datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
-        None,
-        datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
-    ],
-    'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
-    # Not supported by Spark
-    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
-    # Not natively supported by Arrow
-    # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-    'binary': [b'\01', None, b'\22'],
-    'fixed': [
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        None,
-        uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
-    ],
-}
-
-
 @pytest.fixture(scope="session")
 def pa_schema() -> "pa.Schema":
     import pyarrow as pa
@@ -2125,7 +2095,37 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table":
     """Pyarrow table with all kinds of columns."""
     import pyarrow as pa

-    return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)
+    return pa.Table.from_pydict(
+        {
+            'bool': [False, None, True],
+            'string': ['a', None, 'z'],
+            # Go over the 16 bytes to kick in truncation
+            'string_long': ['a' * 22, None, 'z' * 22],
+            'int': [1, None, 9],
+            'long': [1, None, 9],
+            'float': [0.0, None, 0.9],
+            'double': [0.0, None, 0.9],
+            # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
+            'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
+            'timestamptz': [
+                datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
+                None,
+                datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
+            ],
+            'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
+            # Not supported by Spark
+            # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
+            # Not natively supported by Arrow
+            # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
+            'binary': [b'\01', None, b'\22'],
+            'fixed': [
+                uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
+                None,
+                uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
+            ],
+        },
+        schema=pa_schema,
+    )


 @pytest.fixture(scope="session")
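
Note on this refactor (not part of the commit page): with the module-level TEST_DATA_WITH_NULL constant gone, the test modules below iterate the columns directly off the fixture's Arrow table via arrow_table_with_null.column_names instead of the dict's .keys(). A minimal standalone sketch of that pattern, with placeholder data rather than the real fixture schema:

import pyarrow as pa

# Placeholder table standing in for the arrow_table_with_null fixture.
table = pa.Table.from_pydict({'bool': [False, None, True], 'int': [1, None, 9]})

# column_names provides the same iteration the removed dict's .keys() did.
for col in table.column_names:
    nulls = table.column(col).null_count
    print(f"{col}: {nulls} null value(s)")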

tests/integration/test_writes/test_partitioned_writes.py

Lines changed: 11 additions & 7 deletions

@@ -32,7 +32,6 @@
     TruncateTransform,
     YearTransform,
 )
-from tests.conftest import TEST_DATA_WITH_NULL
 from utils import TABLE_SCHEMA, _create_table


@@ -64,7 +63,7 @@ def test_query_filter_null_partitioned(
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
     assert df.count() == 3, f"Expected 3 total rows for {identifier}"
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null"

@@ -75,7 +74,12 @@ def test_query_filter_null_partitioned(
 )
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_query_filter_without_data_partitioned(
-    session_catalog: Catalog, spark: SparkSession, arrow_table_without_data: pa.Table, part_col: str, format_version: int
+    session_catalog: Catalog,
+    spark: SparkSession,
+    arrow_table_without_data: pa.Table,
+    part_col: str,
+    arrow_table_with_null: pa.Table,
+    format_version: int,
 ) -> None:
     # Given
     identifier = f"default.arrow_table_v{format_version}_without_data_partitioned_on_col_{part_col}"
@@ -96,7 +100,7 @@ def test_query_filter_without_data_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
         assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"

@@ -128,7 +132,7 @@ def test_query_filter_only_nulls_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_only_nulls.column_names:
         assert df.where(f"{col} is null").count() == 2, f"Expected 2 row for {col}"
         assert df.where(f"{col} is not null").count() == 0, f"Expected 0 rows for {col}"

@@ -163,7 +167,7 @@ def test_query_filter_appended_null_partitioned(
     # Then
     assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}"
     df = spark.table(identifier)
-    for col in TEST_DATA_WITH_NULL.keys():
+    for col in arrow_table_with_null.column_names:
         df = spark.table(identifier)
         assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}"
@@ -207,7 +211,7 @@ def test_query_filter_v1_v2_append_null(

     # Then
     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"
-    for col in TEST_DATA_WITH_NULL.keys():  # type: ignore
+    for col in arrow_table_with_null.column_names:  # type: ignore
         df = spark.table(identifier)
         assert df.where(f"{col} is not null").count() == 4, f"Expected 4 non-null rows for {col}"
         assert df.where(f"{col} is null").count() == 2, f"Expected 2 null rows for {col}"
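
Note on the widened signature above (not part of the commit page): adding arrow_table_with_null to the parameter list asks pytest to inject that session fixture purely so the test can read its column names, even though the data written comes from arrow_table_without_data. A minimal standalone sketch of requesting a fixture only for its metadata, using placeholder names and data:

import pyarrow as pa
import pytest

@pytest.fixture(scope="session")
def arrow_table_with_null() -> pa.Table:
    # Placeholder fixture; the real one lives in tests/conftest.py.
    return pa.Table.from_pydict({'bool': [False, None, True], 'int': [1, None, 9]})

def test_uses_fixture_for_metadata_only(arrow_table_with_null: pa.Table) -> None:
    # The fixture is consulted only for its column names, not its rows.
    assert 'bool' in arrow_table_with_null.column_names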

tests/integration/test_writes/test_writes.py

Lines changed: 23 additions & 21 deletions

@@ -38,7 +38,6 @@
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.io.pyarrow import _dataframe_to_data_files
 from pyiceberg.table import TableProperties
-from tests.conftest import TEST_DATA_WITH_NULL
 from utils import _create_table


@@ -120,52 +119,55 @@ def test_query_count(spark: SparkSession, format_version: int) -> None:


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_null(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_without_data(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_without_data(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_without_data"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
-    assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}"
+        assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_only_nulls(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_only_nulls(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_with_only_nulls"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}"
-    assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}"
+        assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 @pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_appended_null(spark: SparkSession, col: str, format_version: int) -> None:
+def test_query_filter_appended_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = f"default.arrow_table_v{format_version}_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"


 @pytest.mark.integration
-@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
-def test_query_filter_v1_v2_append_null(spark: SparkSession, col: str) -> None:
+def test_query_filter_v1_v2_append_null(
+    spark: SparkSession,
+    arrow_table_with_null: pa.Table,
+) -> None:
     identifier = "default.arrow_table_v1_v2_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
-    assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"
+    for col in arrow_table_with_null.column_names:
+        assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}"
+        assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}"


 @pytest.mark.integration
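
Note on the test restructuring above (not part of the commit page): dropping the per-column @pytest.mark.parametrize("col", ...) means each test now runs once per format version and loops over every column internally, rather than generating one test case per column. A minimal sketch of the two styles, with a placeholder column list:

import pytest

COLUMNS = ['bool', 'int', 'string']  # placeholder list, not the fixture schema

# Before: pytest generates one test case per column.
@pytest.mark.parametrize("col", COLUMNS)
def test_per_column(col: str) -> None:
    assert col in COLUMNS

# After: a single test case iterates the columns itself.
def test_all_columns() -> None:
    for col in COLUMNS:
        assert col in COLUMNS

The trade-off, roughly: parametrization reports each column as its own test and keeps running past a failing column, while the in-test loop stops at the first failing column but no longer depends on a module-level constant for the column list.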
