|
45 | 45 | pyarrow_to_schema,
|
46 | 46 | schema_to_pyarrow,
|
47 | 47 | visit_pyarrow,
|
| 48 | + ORC_FIELD_ID_KEY, |
48 | 49 | )
|
49 | 50 | from pyiceberg.schema import Accessor, Schema, visit
|
50 | 51 | from pyiceberg.table.name_mapping import MappedField, NameMapping
|
@@ -239,6 +240,64 @@ def test_pyarrow_variable_binary_to_iceberg(pyarrow_type: pa.DataType) -> None:
|
239 | 240 | assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.large_binary()
|
240 | 241 |
|
241 | 242 |
|
def test_orc_vs_parquet_metadata_handling() -> None:
    """Check that ORC-keyed and Parquet-keyed field IDs convert to equal Iceberg results.

    Two PyArrow structs with identical fields are built; they differ only in
    the metadata key that carries the field ID (``ORC_FIELD_ID_KEY`` versus
    ``PARQUET:field_id``).
    """
    orc_key = ORC_FIELD_ID_KEY.decode()
    parquet_key = "PARQUET:field_id"

    def _struct_keyed_by(id_key: str) -> pa.StructType:
        # Same two fields every time; only the field-ID metadata key varies.
        foo = pa.field("foo", pa.string(), nullable=True, metadata={id_key: "1", "doc": "foo doc"})
        bar = pa.field("bar", pa.int32(), nullable=False, metadata={id_key: "2"})
        return pa.struct([foo, bar])

    orc_struct = _struct_keyed_by(orc_key)
    parquet_struct = _struct_keyed_by(parquet_key)

    # Converting either struct must give the same Iceberg result.
    converted_from_orc = visit_pyarrow(orc_struct, _ConvertToIceberg())
    converted_from_parquet = visit_pyarrow(parquet_struct, _ConvertToIceberg())
    assert converted_from_orc == converted_from_parquet

    # The two formats really do use distinct metadata keys.
    assert orc_key == "iceberg.id"
    assert parquet_key != orc_key

    # PyArrow exposes field metadata with bytes keys, so membership is
    # checked with the bytes form of each key.
    orc_foo_metadata = orc_struct[0].metadata
    parquet_foo_metadata = parquet_struct[0].metadata
    assert ORC_FIELD_ID_KEY in orc_foo_metadata  # b'iceberg.id'
    assert b"PARQUET:field_id" in parquet_foo_metadata
    assert ORC_FIELD_ID_KEY not in parquet_foo_metadata
    assert b"PARQUET:field_id" not in orc_foo_metadata
| 276 | + |
| 277 | + |
def test_pyarrow_struct_to_iceberg_orc() -> None:
    """Convert a PyArrow struct whose field IDs are stored under the ORC metadata key.

    The converted result must equal the expected Iceberg ``StructType``, and
    each field must carry the ID given in its ORC metadata.
    """
    id_key = ORC_FIELD_ID_KEY.decode()
    arrow_fields = [
        pa.field("foo", pa.string(), nullable=True, metadata={id_key: "1", "doc": "foo doc"}),
        pa.field("bar", pa.int32(), nullable=False, metadata={id_key: "2"}),
        pa.field("baz", pa.bool_(), nullable=True, metadata={id_key: "3"}),
    ]

    converted = visit_pyarrow(pa.struct(arrow_fields), _ConvertToIceberg())

    assert converted == StructType(
        NestedField(id=1, name="foo", field_type=StringType(), required=False, doc="foo doc"),
        NestedField(id=2, name="bar", field_type=IntegerType(), required=True),
        NestedField(id=3, name="baz", field_type=BooleanType(), required=False),
    )

    # Field IDs must come from the ORC metadata, in declaration order: foo, bar, baz.
    for position, expected_id in enumerate((1, 2, 3)):
        assert converted.fields[position].field_id == expected_id
| 298 | + |
| 299 | + |
| 300 | + |
242 | 301 | def test_pyarrow_struct_to_iceberg() -> None:
|
243 | 302 | pyarrow_struct = pa.struct(
|
244 | 303 | [
|
|
0 commit comments