Commit b6a45ed

fix: sanitize invalid Avro field names in manifest file (#2245)

Closes #2123

Rationale for this change: fixes the sanitization behaviour to match the specification and the Java implementation.

Are these changes tested? Yes - unit and integration tests.

Are there any user-facing changes? Yes - field names will be sanitized to be Avro-compatible if they are not already.

Co-authored-by: Kevin Liu <[email protected]>
1 parent 45f66da commit b6a45ed

File tree

4 files changed: +442 -9 lines changed


pyiceberg/schema.py

Lines changed: 21 additions & 1 deletion

@@ -78,6 +78,9 @@
 
 INITIAL_SCHEMA_ID = 0
 
+FIELD_ID_PROP = "field-id"
+ICEBERG_FIELD_NAME_PROP = "iceberg-field-name"
+
 
 class Schema(IcebergBaseModel):
     """A table Schema.

@@ -1356,6 +1359,21 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType:
 
 # Implementation copied from Apache Iceberg repo.
 def make_compatible_name(name: str) -> str:
+    """Make a field name compatible with Avro specification.
+
+    This function sanitizes field names to comply with Avro naming rules:
+    - Names must start with [A-Za-z_]
+    - Subsequent characters must be [A-Za-z0-9_]
+
+    Invalid characters are replaced with _xHHHH where HHHH is the hex code.
+    Names starting with digits get a leading underscore.
+
+    Args:
+        name: The original field name
+
+    Returns:
+        A sanitized name that complies with Avro specification
+    """
     if not _valid_avro_name(name):
         return _sanitize_name(name)
     return name

@@ -1391,7 +1409,9 @@ def _sanitize_name(name: str) -> str:
 
 
 def _sanitize_char(character: str) -> str:
-    return "_" + character if character.isdigit() else "_x" + hex(ord(character))[2:].upper()
+    if character.isdigit():
+        return "_" + character
+    return "_x" + hex(ord(character))[2:].upper()
 
 
 def sanitize_column_names(schema: Schema) -> Schema:
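As a quick illustration of the rules the new docstring spells out, the expected behaviour of make_compatible_name looks roughly like this. The expected values below mirror the comments in the cross-platform integration test further down; this is a sketch for orientation, not part of the diff.

from pyiceberg.schema import make_compatible_name

# Already-valid names pass through unchanged.
assert make_compatible_name("x_") == "x_"

# A leading digit only gets an underscore prefix, not a hex escape.
assert make_compatible_name("9x") == "_9x"

# Other invalid characters become _x followed by the uppercase hex code point:
# ord(".") == 0x2E, ord("#") == 0x23, ord("/") == 0x2F.
assert make_compatible_name("a.b") == "a_x2Eb"
assert make_compatible_name("a#b") == "a_x23b"
assert make_compatible_name("letter/abc") == "letter_x2Fabc"

# Non-ASCII characters use the full Unicode code point (U+1F60E for "😎"),
# which is where the Python escape differs from Java's surrogate-pair escape.
assert make_compatible_name("😎") == "_x1F60E"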

pyiceberg/utils/schema_conversion.py

Lines changed: 21 additions & 8 deletions

@@ -26,7 +26,14 @@
     Union,
 )
 
-from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
+from pyiceberg.schema import (
+    FIELD_ID_PROP,
+    ICEBERG_FIELD_NAME_PROP,
+    Schema,
+    SchemaVisitorPerPrimitiveType,
+    make_compatible_name,
+    visit,
+)
 from pyiceberg.types import (
     BinaryType,
     BooleanType,

@@ -225,13 +232,13 @@ def _convert_field(self, field: Dict[str, Any]) -> NestedField:
         Returns:
             The Iceberg equivalent field.
         """
-        if "field-id" not in field:
-            raise ValueError(f"Cannot convert field, missing field-id: {field}")
+        if FIELD_ID_PROP not in field:
+            raise ValueError(f"Cannot convert field, missing {FIELD_ID_PROP}: {field}")
 
         plain_type, required = self._resolve_union(field["type"])
 
         return NestedField(
-            field_id=field["field-id"],
+            field_id=field[FIELD_ID_PROP],
             name=field["name"],
             field_type=self._convert_schema(plain_type),
             required=required,

@@ -524,12 +531,18 @@ def field(self, field: NestedField, field_result: AvroType) -> AvroType:
         if isinstance(field_result, dict) and field_result.get("type") == "record":
             field_result["name"] = f"r{field.field_id}"
 
+        original_name = field.name
+        sanitized_name = make_compatible_name(original_name)
+
         result = {
-            "name": field.name,
-            "field-id": field.field_id,
+            "name": sanitized_name,
+            FIELD_ID_PROP: field.field_id,
             "type": field_result if field.required else ["null", field_result],
         }
 
+        if original_name != sanitized_name:
+            result[ICEBERG_FIELD_NAME_PROP] = original_name
+
         if field.write_default is not None:
             result["default"] = field.write_default
         elif field.optional:

@@ -564,8 +577,8 @@ def map(self, map_type: MapType, key_result: AvroType, value_result: AvroType) -
                 "type": "record",
                 "name": f"k{self.last_map_key_field_id}_v{self.last_map_value_field_id}",
                 "fields": [
-                    {"name": "key", "type": key_result, "field-id": self.last_map_key_field_id},
-                    {"name": "value", "type": value_result, "field-id": self.last_map_value_field_id},
+                    {"name": "key", "type": key_result, FIELD_ID_PROP: self.last_map_key_field_id},
+                    {"name": "value", "type": value_result, FIELD_ID_PROP: self.last_map_value_field_id},
                 ],
             },
             "logicalType": "map",

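For intuition, the change to field() means a field whose Iceberg name is not Avro-legal is emitted with a sanitized Avro name plus the original name preserved under the new property. Below is a hand-written sketch of the expected entry for NestedField(field_id=1, name="a.b", field_type=StringType(), required=True); it is not captured from a real run, and the "type" value varies with the field's type and required flag.

expected_avro_field = {
    "name": "a_x2Eb",             # sanitized, Avro-legal name
    "field-id": 1,                # FIELD_ID_PROP
    "type": "string",             # required field, so no ["null", ...] union
    "iceberg-field-name": "a.b",  # ICEBERG_FIELD_NAME_PROP, only added when the name changed
}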
tests/integration/test_writes/test_writes.py

Lines changed: 131 additions & 0 deletions

@@ -1201,6 +1201,137 @@ def test_sanitize_character_partitioned(catalog: Catalog) -> None:
     assert len(tbl.scan().to_arrow()) == 22
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
+def test_sanitize_character_partitioned_avro_bug(catalog: Catalog) -> None:
+    table_name = "default.test_table_partitioned_sanitized_character_avro"
+    try:
+        catalog.drop_table(table_name)
+    except NoSuchTableError:
+        pass
+
+    schema = Schema(
+        NestedField(id=1, name="😎", field_type=StringType(), required=False),
+    )
+
+    partition_spec = PartitionSpec(
+        PartitionField(
+            source_id=1,
+            field_id=1001,
+            transform=IdentityTransform(),
+            name="😎",
+        )
+    )
+
+    tbl = _create_table(
+        session_catalog=catalog,
+        identifier=table_name,
+        schema=schema,
+        partition_spec=partition_spec,
+        data=[
+            pa.Table.from_arrays(
+                [pa.array([str(i) for i in range(22)])], schema=pa.schema([pa.field("😎", pa.string(), nullable=False)])
+            )
+        ],
+    )
+
+    assert len(tbl.scan().to_arrow()) == 22
+
+    # verify that we can read the table with DuckDB
+    import duckdb
+
+    location = tbl.metadata_location
+    duckdb.sql("INSTALL iceberg; LOAD iceberg;")
+    # Configure S3 settings for DuckDB to match the catalog configuration
+    duckdb.sql("SET s3_endpoint='localhost:9000';")
+    duckdb.sql("SET s3_access_key_id='admin';")
+    duckdb.sql("SET s3_secret_access_key='password';")
+    duckdb.sql("SET s3_use_ssl=false;")
+    duckdb.sql("SET s3_url_style='path';")
+    result = duckdb.sql(f"SELECT * FROM iceberg_scan('{location}')").fetchall()
+    assert len(result) == 22
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_cross_platform_special_character_compatibility(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    """Test cross-platform compatibility with special characters in column names."""
+    identifier = "default.test_cross_platform_special_characters"
+
+    # Test various special characters that need sanitization
+    special_characters = [
+        "😎",  # emoji - Java produces _xD83D_xDE0E, Python produces _x1F60E
+        "a.b",  # dot - both should produce a_x2Eb
+        "a#b",  # hash - both should produce a_x23b
+        "9x",  # starts with digit - both should produce _9x
+        "x_",  # valid - should remain unchanged
+        "letter/abc",  # slash - both should produce letter_x2Fabc
+    ]
+
+    for i, special_char in enumerate(special_characters):
+        table_name = f"{identifier}_{format_version}_{i}"
+        pyiceberg_table_name = f"{identifier}_pyiceberg_{format_version}_{i}"
+
+        try:
+            session_catalog.drop_table(table_name)
+        except Exception:
+            pass
+        try:
+            session_catalog.drop_table(pyiceberg_table_name)
+        except Exception:
+            pass
+
+        try:
+            # Test 1: Spark writes, PyIceberg reads
+            spark_df = spark.createDataFrame([("test_value",)], [special_char])
+            spark_df.writeTo(table_name).using("iceberg").createOrReplace()
+
+            # Read with PyIceberg table scan
+            tbl = session_catalog.load_table(table_name)
+            pyiceberg_df = tbl.scan().to_pandas()
+            assert len(pyiceberg_df) == 1
+            assert special_char in pyiceberg_df.columns
+            assert pyiceberg_df.iloc[0][special_char] == "test_value"
+
+            # Test 2: PyIceberg writes, Spark reads
+            from pyiceberg.schema import Schema
+            from pyiceberg.types import NestedField, StringType
+
+            schema = Schema(NestedField(field_id=1, name=special_char, field_type=StringType(), required=True))
+
+            tbl_pyiceberg = session_catalog.create_table(
+                identifier=pyiceberg_table_name, schema=schema, properties={"format-version": str(format_version)}
+            )
+
+            import pyarrow as pa
+
+            # Create PyArrow schema with required field to match Iceberg schema
+            pa_schema = pa.schema([pa.field(special_char, pa.string(), nullable=False)])
+            data = pa.Table.from_pydict({special_char: ["pyiceberg_value"]}, schema=pa_schema)
+            tbl_pyiceberg.append(data)
+
+            # Read with Spark
+            spark_df_read = spark.table(pyiceberg_table_name)
+            spark_result = spark_df_read.collect()
+
+            # Verify data integrity
+            assert len(spark_result) == 1
+            assert special_char in spark_df_read.columns
+            assert spark_result[0][special_char] == "pyiceberg_value"
+
+        finally:
+            try:
+                session_catalog.drop_table(table_name)
+            except Exception:
+                pass
+            try:
+                session_catalog.drop_table(pyiceberg_table_name)
+            except Exception:
+                pass
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None:
