Skip to content

Commit feb2918

Browse files
committed
feat(arrow): Use field name for lookup when field_id in parquet metadata is unavailable
When reading Arrow data from sources that don't provide the PARQUET:field_id metadata (like DataFusion), the column lookup failed because struct fields were matched by field ID only. This change introduces a fallback mechanism that looks up a field by name when no field ID can be read from the Arrow field metadata. This improves compatibility with various Arrow data sources.
The commit also includes:
- A new unit test to verify the name-based fallback logic.
- A more detailed error message when a field can't be found.
1 parent 195fb73 commit feb2918

File tree

1 file changed

+82
-4
lines changed

1 file changed

+82
-4
lines changed

crates/iceberg/src/arrow/value.rs

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -459,14 +459,27 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
459459
.fields()
460460
.iter()
461461
.position(|arrow_field| {
462-
get_field_id(arrow_field)
463-
.map(|id| id == field.id)
464-
.unwrap_or(false)
462+
if let Ok(arrow_field_id) = get_field_id(arrow_field) {
463+
arrow_field_id == field.id
464+
} else {
465+
// Fall back to name matching when no field id can be read (for DataFusion compatibility)
466+
arrow_field.name() == &field.name
467+
}
465468
})
466469
.ok_or_else(|| {
467470
Error::new(
468471
ErrorKind::DataInvalid,
469-
format!("Field id {} not found in struct array", field.id),
472+
format!(
473+
"Field with name '{}' (id: {}) not found in struct array. Available fields: [{}]",
474+
field.name,
475+
field.id,
476+
struct_array
477+
.fields()
478+
.iter()
479+
.map(|f| f.name().as_str())
480+
.collect::<Vec<_>>()
481+
.join(", ")
482+
),
470483
)
471484
})?;
472485

@@ -1226,4 +1239,69 @@ mod test {
12261239
]))),
12271240
]);
12281241
}
1242+
1243+
#[test]
1244+
fn test_field_partner_with_datafusion_schema() {
1245+
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
1246+
1247+
use crate::spec::{NestedField, PrimitiveType, Type};
1248+
1249+
let id_field = "id";
1250+
let score_field = "score";
1251+
1252+
// Create an Arrow schema where `id` carries PARQUET:field_id metadata
1253+
// while `score` has no PARQUET:field_id metadata (like fields produced by DataFusion)
1254+
let arrow_schema = ArrowSchema::new(vec![
1255+
ArrowField::new(id_field, DataType::Int64, false).with_metadata(HashMap::from([(
1256+
PARQUET_FIELD_ID_META_KEY.to_string(),
1257+
"1".to_string(),
1258+
)])),
1259+
ArrowField::new(score_field, DataType::Float64, true),
1260+
]);
1261+
1262+
// Create test data
1263+
let id_array = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef;
1264+
let score_array =
1265+
Arc::new(Float64Array::from(vec![Some(95.5), Some(87.2), None])) as ArrayRef;
1266+
1267+
let struct_array = Arc::new(StructArray::new(
1268+
arrow_schema.fields().clone(),
1269+
vec![id_array, score_array],
1270+
None,
1271+
)) as ArrayRef;
1272+
1273+
// Create corresponding Iceberg nested fields
1274+
let id_field = NestedField {
1275+
id: 1,
1276+
name: id_field.to_string(),
1277+
required: true,
1278+
field_type: Box::new(Type::Primitive(PrimitiveType::Long)),
1279+
doc: None,
1280+
initial_default: None,
1281+
write_default: None,
1282+
};
1283+
1284+
let score_field = NestedField {
1285+
id: 2,
1286+
name: score_field.to_string(),
1287+
required: false,
1288+
field_type: Box::new(Type::Primitive(PrimitiveType::Double)),
1289+
doc: None,
1290+
initial_default: None,
1291+
write_default: None,
1292+
};
1293+
1294+
let accessor = ArrowArrayAccessor;
1295+
1296+
// Test field matching by id: it should succeed because `id` has PARQUET:field_id metadata
1297+
let id_partner = accessor.field_partner(&struct_array, &id_field).unwrap();
1298+
assert_eq!(id_partner.len(), 3);
1299+
assert_eq!(id_partner.data_type(), &DataType::Int64);
1300+
1301+
// Test field matching by name: `score` has no PARQUET:field_id metadata, so it cannot be matched by id
1302+
// But it should fall back to name
1303+
let score_partner = accessor.field_partner(&struct_array, &score_field).unwrap();
1304+
assert_eq!(score_partner.len(), 3);
1305+
assert_eq!(score_partner.data_type(), &DataType::Float64);
1306+
}
12291307
}

0 commit comments

Comments
 (0)