Skip to content

Commit daf4d72

Browse files
committed
Fix tests in record_batch_transformer.rs to match new behavior.
1 parent 2938393 commit daf4d72

File tree

1 file changed

+5
-127
lines changed

1 file changed

+5
-127
lines changed

crates/iceberg/src/arrow/record_batch_transformer.rs

Lines changed: 5 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1335,7 +1335,7 @@ mod test {
13351335
/// - Iceberg spec: format/spec.md "Column Projection" section
13361336
#[test]
13371337
fn test_all_four_spec_rules() {
1338-
use crate::spec::{MappedField, NameMapping, Transform};
1338+
use crate::spec::Transform;
13391339

13401340
// Iceberg schema with columns designed to exercise each spec rule
13411341
let snapshot_schema = Arc::new(
@@ -1346,7 +1346,7 @@ mod test {
13461346
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
13471347
// Rule #1: Identity-partitioned field - should use partition metadata
13481348
NestedField::required(2, "dept", Type::Primitive(PrimitiveType::String)).into(),
1349-
// Rule #2: Field resolved by name mapping (no field ID in Parquet)
1349+
// Rule #2: Field resolved by name mapping (ArrowReader already applied)
13501350
NestedField::required(3, "data", Type::Primitive(PrimitiveType::String)).into(),
13511351
// Rule #3: Field with initial_default
13521352
NestedField::optional(4, "category", Type::Primitive(PrimitiveType::String))
@@ -1373,20 +1373,14 @@ mod test {
13731373
// Partition data: dept="engineering"
13741374
let partition_data = Struct::from_iter(vec![Some(Literal::string("engineering"))]);
13751375

1376-
// Parquet schema: has id (with field_id=1) and data (without field ID)
1376+
// Parquet schema: simulates post-ArrowReader state where name mapping already applied
1377+
// Has id (field_id=1) and data (field_id=3, assigned by ArrowReader via name mapping)
13771378
// Missing: dept (in partition), category (has default), notes (no default)
13781379
let parquet_schema = Arc::new(ArrowSchema::new(vec![
13791380
simple_field("id", DataType::Int32, false, "1"),
1380-
Field::new("data", DataType::Utf8, false), // No field ID - needs name mapping
1381+
simple_field("data", DataType::Utf8, false, "3"),
13811382
]));
13821383

1383-
// Name mapping: maps field ID 3 to "data" column
1384-
let name_mapping = Arc::new(NameMapping::new(vec![MappedField::new(
1385-
Some(3),
1386-
vec!["data".to_string()],
1387-
vec![],
1388-
)]));
1389-
13901384
let projected_field_ids = [1, 2, 3, 4, 5]; // id, dept, data, category, notes
13911385

13921386
let mut transformer =
@@ -1452,120 +1446,4 @@ mod test {
14521446
assert!(notes_column.is_null(0));
14531447
assert!(notes_column.is_null(1));
14541448
}
1455-
1456-
/// Verifies field ID conflict detection for add_files imports.
1457-
///
1458-
/// Why: add_files can import Parquet with conflicting field IDs (field_id=1->"name" in Parquet
1459-
/// vs field_id=1->"id" in Iceberg). Name-checking detects conflicts, treats fields as "not present",
1460-
/// allowing spec fallback to partition constants and name mapping.
1461-
///
1462-
/// Reproduces: TestAddFilesProcedure.addDataPartitioned
1463-
#[test]
1464-
fn add_files_with_field_id_conflicts_like_java_test() {
1465-
use crate::spec::{MappedField, NameMapping, Struct, Transform};
1466-
1467-
// Iceberg schema (field IDs 1-4)
1468-
let snapshot_schema = Arc::new(
1469-
Schema::builder()
1470-
.with_schema_id(0)
1471-
.with_fields(vec![
1472-
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
1473-
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
1474-
NestedField::required(3, "dept", Type::Primitive(PrimitiveType::String)).into(),
1475-
NestedField::required(4, "subdept", Type::Primitive(PrimitiveType::String))
1476-
.into(),
1477-
])
1478-
.build()
1479-
.unwrap(),
1480-
);
1481-
1482-
let partition_spec = Arc::new(
1483-
crate::spec::PartitionSpec::builder(snapshot_schema.clone())
1484-
.with_spec_id(0)
1485-
.add_partition_field("id", "id", Transform::Identity)
1486-
.unwrap()
1487-
.build()
1488-
.unwrap(),
1489-
);
1490-
1491-
let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]);
1492-
1493-
// Parquet schema with CONFLICTING field IDs (1-3 instead of 1-4)
1494-
let parquet_schema = Arc::new(ArrowSchema::new(vec![
1495-
simple_field("name", DataType::Utf8, false, "1"),
1496-
simple_field("dept", DataType::Utf8, false, "2"),
1497-
simple_field("subdept", DataType::Utf8, false, "3"),
1498-
]));
1499-
1500-
// WHY name mapping: Resolves conflicts by mapping Iceberg field IDs to Parquet column names
1501-
let name_mapping = Arc::new(NameMapping::new(vec![
1502-
MappedField::new(Some(2), vec!["name".to_string()], vec![]),
1503-
MappedField::new(Some(3), vec!["dept".to_string()], vec![]),
1504-
MappedField::new(Some(4), vec!["subdept".to_string()], vec![]),
1505-
]));
1506-
1507-
let projected_field_ids = [1, 2, 3, 4]; // id, name, dept, subdept
1508-
1509-
let mut transformer =
1510-
RecordBatchTransformerBuilder::new(snapshot_schema, &projected_field_ids)
1511-
.with_partition(Some(partition_spec), Some(partition_data))
1512-
.build();
1513-
1514-
// Parquet data (one row)
1515-
let parquet_batch = RecordBatch::try_new(parquet_schema, vec![
1516-
Arc::new(StringArray::from(vec!["John Doe"])), // name column
1517-
Arc::new(StringArray::from(vec!["Engineering"])), // dept column
1518-
Arc::new(StringArray::from(vec!["Backend"])), // subdept column
1519-
])
1520-
.unwrap();
1521-
1522-
let result = transformer.process_record_batch(parquet_batch).unwrap();
1523-
1524-
assert_eq!(result.num_columns(), 4);
1525-
assert_eq!(result.num_rows(), 1);
1526-
1527-
// Verify: id from partition constant (conflict with Parquet field_id=1)
1528-
assert_eq!(
1529-
result
1530-
.column(0)
1531-
.as_any()
1532-
.downcast_ref::<Int32Array>()
1533-
.unwrap()
1534-
.value(0),
1535-
1
1536-
);
1537-
1538-
// Verify: name via name mapping (conflict with Parquet field_id=2)
1539-
assert_eq!(
1540-
result
1541-
.column(1)
1542-
.as_any()
1543-
.downcast_ref::<StringArray>()
1544-
.unwrap()
1545-
.value(0),
1546-
"John Doe"
1547-
);
1548-
1549-
// Verify: dept via name mapping (conflict with Parquet field_id=3)
1550-
assert_eq!(
1551-
result
1552-
.column(2)
1553-
.as_any()
1554-
.downcast_ref::<StringArray>()
1555-
.unwrap()
1556-
.value(0),
1557-
"Engineering"
1558-
);
1559-
1560-
// Verify: subdept via name mapping (not in Parquet by field ID)
1561-
assert_eq!(
1562-
result
1563-
.column(3)
1564-
.as_any()
1565-
.downcast_ref::<StringArray>()
1566-
.unwrap()
1567-
.value(0),
1568-
"Backend"
1569-
);
1570-
}
15711449
}

0 commit comments

Comments
 (0)