@@ -63,6 +63,33 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363 return attributes;
6464}
6565
// Converts an arbitrary field name into one that satisfies the Avro name
// grammar `[A-Za-z_][A-Za-z0-9_]*`.
//
// Rules applied:
//   * An empty name becomes "_empty".
//   * If the first character is not an ASCII letter or underscore it is
//     replaced by '_'.
//   * Every later character that is not an ASCII letter, digit or underscore
//     is replaced by '_'.
//
// The result always has the same length as the input (except for the empty
// case), so different inputs may map to the same sanitized name.
//
// The checks are deliberately plain ASCII range comparisons rather than
// std::isalpha/std::isalnum: those are locale-dependent and have undefined
// behaviour when handed a negative `char` (e.g. any byte of a multi-byte
// UTF-8 sequence on platforms where `char` is signed).
std::string SanitizeFieldName(std::string_view field_name) {
  if (field_name.empty()) {
    return "_empty";
  }

  const auto is_valid_first = [](char c) {
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_';
  };
  const auto is_valid_rest = [&is_valid_first](char c) {
    return is_valid_first(c) || (c >= '0' && c <= '9');
  };

  std::string result;
  result.reserve(field_name.size());

  // First character must be a letter or underscore.
  result.push_back(is_valid_first(field_name.front()) ? field_name.front() : '_');

  // Remaining characters must be letters, digits, or underscores.
  for (const char c : field_name.substr(1)) {
    result.push_back(is_valid_rest(c) ? c : '_');
  }
  return result;
}
92+
6693} // namespace
6794
6895std::string ToString (const ::avro::NodePtr& node) {
@@ -188,8 +215,17 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
188215 ::avro::NodePtr field_node;
189216 ICEBERG_RETURN_UNEXPECTED (Visit (sub_field, &field_node));
190217
191- // TODO(gangwu): sanitize field name
192- (*node)->addName (std::string (sub_field.name ()));
218+ // Sanitize field name to ensure it follows Avro field name requirements
219+ std::string sanitized_name = SanitizeFieldName (sub_field.name ());
220+ // Store original name as a custom attribute if it was modified
221+ if (sanitized_name != sub_field.name ()) {
222+ // Add custom attribute to preserve the original field name
223+ ::avro::CustomAttributes attrs;
224+ attrs.addAttribute (std::string (kIcebergFieldNameProp ),
225+ std::string (sub_field.name ()));
226+ (*node)->addCustomAttributesForField (attrs);
227+ }
228+ (*node)->addName (sanitized_name);
193229 (*node)->addLeaf (field_node);
194230 (*node)->addCustomAttributesForField (GetAttributesWithFieldId (sub_field.field_id ()));
195231 }
@@ -839,7 +875,15 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig
839875 // Recursively apply field IDs to nested fields
840876 ICEBERG_ASSIGN_OR_RAISE (auto new_nested_node,
841877 MakeAvroNodeWithFieldIds (field_node, *nested_field));
842- new_record_node->addName (field_name);
878+ std::string sanitized_name = SanitizeFieldName (field_name);
879+ // Store original name as a custom attribute if it was modified
880+ if (sanitized_name != field_name) {
881+ // Add custom attribute to preserve the original field name
882+ ::avro::CustomAttributes attrs;
883+ attrs.addAttribute (std::string (kIcebergFieldNameProp ), field_name);
884+ new_record_node->addCustomAttributesForField (attrs);
885+ }
886+ new_record_node->addName (sanitized_name);
843887 new_record_node->addLeaf (new_nested_node);
844888 }
845889
0 commit comments