Skip to content

Commit 6f614f7

Browse files
committed
feat(arrow): Use field name for lookup when field_id in parquet metadata is unavailable
When reading Arrow data from sources that don't provide the PARQUET:field_id metadata (like DataFusion), the column lookup failed. This change introduces a fallback mechanism that looks up fields by name when the field ID is not present in the Arrow field metadata, which improves compatibility with various Arrow data sources. The commit also includes a new unit test that verifies the name-based fallback logic, and a more detailed error message for when a field can't be found.
1 parent 195fb73 commit 6f614f7

File tree

1 file changed

+77
-3
lines changed

1 file changed

+77
-3
lines changed

crates/iceberg/src/arrow/value.rs

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -460,13 +460,22 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
460460
.iter()
461461
.position(|arrow_field| {
462462
get_field_id(arrow_field)
463-
.map(|id| id == field.id)
464-
.unwrap_or(false)
463+
.map_or(arrow_field.name() == &field.name, |id| id == field.id)
465464
})
466465
.ok_or_else(|| {
467466
Error::new(
468467
ErrorKind::DataInvalid,
469-
format!("Field id {} not found in struct array", field.id),
468+
format!(
469+
"Field with name '{}' (id: {}) not found in struct array. Available fields: [{}]",
470+
field.name,
471+
field.id,
472+
struct_array
473+
.fields()
474+
.iter()
475+
.map(|f| f.name().as_str())
476+
.collect::<Vec<_>>()
477+
.join(", ")
478+
),
470479
)
471480
})?;
472481

@@ -1226,4 +1235,69 @@ mod test {
12261235
]))),
12271236
]);
12281237
}
1238+
1239+
#[test]
1240+
fn test_field_partner_with_datafusion_schema() {
1241+
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
1242+
1243+
use crate::spec::{NestedField, PrimitiveType, Type};
1244+
1245+
let id_field = "id";
1246+
let score_field = "score";
1247+
1248+
// Create an Arrow schema in which `id` carries PARQUET:field_id metadata
1249+
// and `score` has no PARQUET:field_id metadata (like DataFusion)
1250+
let arrow_schema = ArrowSchema::new(vec![
1251+
ArrowField::new(id_field, DataType::Int64, false).with_metadata(HashMap::from([(
1252+
PARQUET_FIELD_ID_META_KEY.to_string(),
1253+
"1".to_string(),
1254+
)])),
1255+
ArrowField::new(score_field, DataType::Float64, true),
1256+
]);
1257+
1258+
// Create test data
1259+
let id_array = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef;
1260+
let score_array =
1261+
Arc::new(Float64Array::from(vec![Some(95.5), Some(87.2), None])) as ArrayRef;
1262+
1263+
let struct_array = Arc::new(StructArray::new(
1264+
arrow_schema.fields().clone(),
1265+
vec![id_array, score_array],
1266+
None,
1267+
)) as ArrayRef;
1268+
1269+
// Create corresponding Iceberg nested fields
1270+
let id_field = NestedField {
1271+
id: 1,
1272+
name: id_field.to_string(),
1273+
required: true,
1274+
field_type: Box::new(Type::Primitive(PrimitiveType::Long)),
1275+
doc: None,
1276+
initial_default: None,
1277+
write_default: None,
1278+
};
1279+
1280+
let score_field = NestedField {
1281+
id: 2,
1282+
name: score_field.to_string(),
1283+
required: false,
1284+
field_type: Box::new(Type::Primitive(PrimitiveType::Double)),
1285+
doc: None,
1286+
initial_default: None,
1287+
write_default: None,
1288+
};
1289+
1290+
let accessor = ArrowArrayAccessor;
1291+
1292+
// Test field matching by id: this should succeed because `id` has PARQUET:field_id metadata
1293+
let id_partner = accessor.field_partner(&struct_array, &id_field).unwrap();
1294+
assert_eq!(id_partner.len(), 3);
1295+
assert_eq!(id_partner.data_type(), &DataType::Int64);
1296+
1297+
// Test field matching by name: `score` doesn't have PARQUET:field_id metadata,
1298+
// so the lookup should fall back to the field name
1299+
let score_partner = accessor.field_partner(&struct_array, &score_field).unwrap();
1300+
assert_eq!(score_partner.len(), 3);
1301+
assert_eq!(score_partner.data_type(), &DataType::Float64);
1302+
}
12291303
}

0 commit comments

Comments
 (0)