3333#include < nlohmann/json.hpp>
3434
3535#include " iceberg/arrow/arrow_fs_file_io.h"
36+ #include " iceberg/avro/avro_constants.h"
3637#include " iceberg/avro/avro_data_util_internal.h"
3738#include " iceberg/avro/avro_schema_util_internal.h"
3839#include " iceberg/avro/avro_stream_internal.h"
39- #include " iceberg/json_internal.h"
4040#include " iceberg/name_mapping.h"
4141#include " iceberg/schema_internal.h"
4242#include " iceberg/util/checked_cast.h"
@@ -102,14 +102,9 @@ class AvroBatchReader::Impl {
102102
103103 if (has_id_visitor.HasNoIds ()) {
104104 // Apply field IDs based on name mapping if available
105- auto name_mapping_iter = options.properties .find (" name_mapping" );
106- if (name_mapping_iter != options.properties .end ()) {
107- // Parse name mapping from JSON string
108- ICEBERG_ASSIGN_OR_RAISE (auto name_mapping,
109- iceberg::NameMappingFromJson (
110- nlohmann::json::parse (name_mapping_iter->second )));
111- ICEBERG_RETURN_UNEXPECTED (
112- ApplyFieldIdsFromNameMapping (file_schema.root (), *name_mapping));
105+ if (options.name_mapping ) {
106+ ICEBERG_RETURN_UNEXPECTED (ApplyFieldIdsFromNameMapping (*options.name_mapping ,
107+ file_schema.root ().get ()));
113108 } else {
114109 return NotImplemented (
115110 " Avro file schema has no field IDs and no name mapping provided" );
@@ -210,8 +205,8 @@ class AvroBatchReader::Impl {
210205 }
211206
212207 // Apply field IDs to Avro schema nodes based on name mapping
213- Status ApplyFieldIdsFromNameMapping (const ::avro::NodePtr& node ,
214- const NameMapping& name_mapping ) {
208+ Status ApplyFieldIdsFromNameMapping (const NameMapping& name_mapping ,
209+ ::avro::Node* node ) {
215210 switch (node->type ()) {
216211 case ::avro::AVRO_RECORD:
217212 return ApplyFieldIdsToRecord (node, name_mapping);
@@ -238,19 +233,19 @@ class AvroBatchReader::Impl {
238233 }
239234 }
240235
241- Status ApplyFieldIdsToRecord (const ::avro::NodePtr& node,
242- const NameMapping& name_mapping) {
236+ Status ApplyFieldIdsToRecord (::avro::Node* node, const NameMapping& name_mapping) {
243237 for (size_t i = 0 ; i < node->leaves (); ++i) {
244238 const std::string& field_name = node->nameAt (i);
245- ::avro::NodePtr field_node = node->leafAt (i);
239+ ::avro::Node* field_node = node->leafAt (i). get ( );
246240
247241 // Try to find field ID by name in the name mapping
248242 if (auto field_ref = name_mapping.Find (field_name)) {
249243 if (field_ref->get ().field_id .has_value ()) {
250244 // Add field ID attribute to the node
251245 ::avro::CustomAttributes attributes;
252- attributes.addAttribute (
253- " field-id" , std::to_string (field_ref->get ().field_id .value ()), false );
246+ attributes.addAttribute (std::string (kFieldId ),
247+ std::to_string (field_ref->get ().field_id .value ()),
248+ false );
254249 node->addCustomAttributesForField (attributes);
255250 }
256251
@@ -262,92 +257,101 @@ class AvroBatchReader::Impl {
262257 std::vector<MappedField> fields_vector (fields_span.begin (), fields_span.end ());
263258 auto nested_name_mapping = NameMapping::Make (std::move (fields_vector));
264259 ICEBERG_RETURN_UNEXPECTED (
265- ApplyFieldIdsFromNameMapping (field_node, *nested_name_mapping));
260+ ApplyFieldIdsFromNameMapping (*nested_name_mapping, field_node ));
266261 } else {
267262 // Recursively apply field IDs to child nodes (only if not already handled by
268263 // nested mapping)
269264 ICEBERG_RETURN_UNEXPECTED (
270- ApplyFieldIdsFromNameMapping (field_node, name_mapping ));
265+ ApplyFieldIdsFromNameMapping (name_mapping, field_node ));
271266 }
272267 } else {
273268 // Recursively apply field IDs to child nodes even if no mapping found
274- ICEBERG_RETURN_UNEXPECTED (ApplyFieldIdsFromNameMapping (field_node, name_mapping ));
269+ ICEBERG_RETURN_UNEXPECTED (ApplyFieldIdsFromNameMapping (name_mapping, field_node ));
275270 }
276271 }
277272 return {};
278273 }
279274
280- Status ApplyFieldIdsToArray (const ::avro::NodePtr& node,
281- const NameMapping& name_mapping) {
275+ Status ApplyFieldIdsToArray (::avro::Node* node, const NameMapping& name_mapping) {
276+ // TODO(liuxiaoyu): Add debug logging to print node information for troubleshooting
277+ // when array type validation fails
282278 if (node->leaves () != 1 ) {
283279 return InvalidSchema (" Array type must have exactly one leaf" );
284280 }
285281
286282 // Check if this is a map represented as array
287283 if (node->logicalType ().type () == ::avro::LogicalType::CUSTOM &&
288284 node->logicalType ().customLogicalType () != nullptr &&
289- node->logicalType ().customLogicalType ()->name () == " map " ) {
290- return ApplyFieldIdsFromNameMapping (node->leafAt (0 ), name_mapping );
285+ node->logicalType ().customLogicalType ()->name () == kMapLogicalType ) {
286+ return ApplyFieldIdsFromNameMapping (name_mapping, node->leafAt (0 ). get () );
291287 }
292288
293289 // For regular arrays, try to find element field ID
294- if (auto element_field = name_mapping.Find (" element " )) {
290+ if (auto element_field = name_mapping.Find (std::string ( kElement ) )) {
295291 if (element_field->get ().field_id .has_value ()) {
296292 ::avro::CustomAttributes attributes;
297- attributes.addAttribute (
298- " element-id" , std::to_string (element_field->get ().field_id .value ()), false );
293+ attributes.addAttribute (std::string (kElementId ),
294+ std::to_string (element_field->get ().field_id .value ()),
295+ false );
299296 node->addCustomAttributesForField (attributes);
300297 }
301298 }
302299
303- return ApplyFieldIdsFromNameMapping (node->leafAt (0 ), name_mapping );
300+ return ApplyFieldIdsFromNameMapping (name_mapping, node->leafAt (0 ). get () );
304301 }
305302
306- Status ApplyFieldIdsToMap (const ::avro::NodePtr& node,
307- const NameMapping& name_mapping) {
303+ Status ApplyFieldIdsToMap (::avro::Node* node, const NameMapping& name_mapping) {
308304 if (node->leaves () != 2 ) {
309305 return InvalidSchema (" Map type must have exactly two leaves" );
310306 }
311307
312308 // Try to find key and value field IDs
313- if (auto key_field = name_mapping.Find (" key " )) {
309+ if (auto key_field = name_mapping.Find (std::string ( kKey ) )) {
314310 if (key_field->get ().field_id .has_value ()) {
315311 ::avro::CustomAttributes attributes;
316- attributes.addAttribute (" key-id " ,
312+ attributes.addAttribute (std::string ( kKeyId ) ,
317313 std::to_string (key_field->get ().field_id .value ()), false );
318314 node->addCustomAttributesForField (attributes);
319315 }
320316 }
321317
322- if (auto value_field = name_mapping.Find (" value " )) {
318+ if (auto value_field = name_mapping.Find (std::string ( kValue ) )) {
323319 if (value_field->get ().field_id .has_value ()) {
324320 ::avro::CustomAttributes attributes;
325- attributes.addAttribute (
326- " value-id" , std::to_string (value_field->get ().field_id .value ()), false );
321+ attributes.addAttribute (std::string (kValueId ),
322+ std::to_string (value_field->get ().field_id .value ()),
323+ false );
327324 node->addCustomAttributesForField (attributes);
328325 }
329326 }
330327
331- return ApplyFieldIdsFromNameMapping (node->leafAt (1 ), name_mapping );
328+ return ApplyFieldIdsFromNameMapping (name_mapping, node->leafAt (1 ). get () );
332329 }
333330
334- Status ApplyFieldIdsToUnion (const ::avro::NodePtr& node,
335- const NameMapping& name_mapping) {
331+ Status ApplyFieldIdsToUnion (::avro::Node* node, const NameMapping& name_mapping) {
336332 if (node->leaves () != 2 ) {
337333 return InvalidSchema (" Union type must have exactly two branches" );
338334 }
339335
340336 const auto & branch_0 = node->leafAt (0 );
341337 const auto & branch_1 = node->leafAt (1 );
342338
343- if (branch_0->type () == ::avro::AVRO_NULL) {
344- return ApplyFieldIdsFromNameMapping (branch_1, name_mapping);
339+ bool branch_0_is_null = (branch_0->type () == ::avro::AVRO_NULL);
340+ bool branch_1_is_null = (branch_1->type () == ::avro::AVRO_NULL);
341+
342+ if (branch_0_is_null && !branch_1_is_null) {
343+ // branch_0 is null, branch_1 is not null
344+ return ApplyFieldIdsFromNameMapping (name_mapping, branch_1.get ());
345+ } else if (!branch_0_is_null && branch_1_is_null) {
346+ // branch_0 is not null, branch_1 is null
347+ return ApplyFieldIdsFromNameMapping (name_mapping, branch_0.get ());
348+ } else if (branch_0_is_null && branch_1_is_null) {
349+ // Both branches are null - this is invalid
350+ return InvalidSchema (" Union type cannot have two null branches" );
351+ } else {
352+ // Neither branch is null - this is invalid
353+ return InvalidSchema (" Union type must have exactly one null branch" );
345354 }
346- if (branch_1->type () == ::avro::AVRO_NULL) {
347- return ApplyFieldIdsFromNameMapping (branch_0, name_mapping);
348- }
349-
350- return InvalidSchema (" Union type must have exactly one null branch" );
351355 }
352356
353357 private:
0 commit comments