Skip to content

Commit 80a0822

Browse files
feat: avro schema add sanitize field name
1 parent 82a1cd6 commit 80a0822

File tree

1 file changed

+47
-3
lines changed

1 file changed

+47
-3
lines changed

src/iceberg/avro/avro_schema_util.cc

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,33 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363
return attributes;
6464
}
6565

66+
// Rewrites `field_name` so that it matches the Avro name grammar
// [A-Za-z_][A-Za-z0-9_]*.
//
// - An empty name becomes "_empty".
// - A first character that is not an ASCII letter or '_' is replaced by '_'.
// - Every later character that is not an ASCII letter, digit, or '_' is
//   replaced by '_'.
//
// The replacement is byte-for-byte, so the result always has the same length
// as the input (except for the empty-input case). Distinct inputs may map to
// the same sanitized name; callers that need the original name must keep it
// separately.
std::string SanitizeFieldName(std::string_view field_name) {
  if (field_name.empty()) {
    return "_empty";
  }

  // Character classes are spelled out as explicit ASCII ranges instead of
  // calling std::isalpha / std::isalnum. Those functions take an int that must
  // be representable as unsigned char (or equal EOF), so handing them a plain
  // `char` holding a non-ASCII byte is undefined behavior where char is signed,
  // and their answers vary with the active C locale.
  const auto is_ascii_letter = [](char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  };
  const auto is_ascii_digit = [](char c) { return c >= '0' && c <= '9'; };

  std::string result;
  result.reserve(field_name.size());

  // First character: letter or underscore only (digits are not allowed here).
  const char first = field_name.front();
  result.push_back(is_ascii_letter(first) || first == '_' ? first : '_');

  // Remaining characters: letters, digits, or underscores.
  for (const char c : field_name.substr(1)) {
    const bool allowed = is_ascii_letter(c) || is_ascii_digit(c) || c == '_';
    result.push_back(allowed ? c : '_');
  }
  return result;
}
92+
6693
} // namespace
6794

6895
std::string ToString(const ::avro::NodePtr& node) {
@@ -188,8 +215,17 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
188215
::avro::NodePtr field_node;
189216
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
190217

191-
// TODO(gangwu): sanitize field name
192-
(*node)->addName(std::string(sub_field.name()));
218+
// Sanitize field name to ensure it follows Avro field name requirements
219+
std::string sanitized_name = SanitizeFieldName(sub_field.name());
220+
// Store original name as a custom attribute if it was modified
221+
if (sanitized_name != sub_field.name()) {
222+
// Add custom attribute to preserve the original field name
223+
::avro::CustomAttributes attrs;
224+
attrs.addAttribute(std::string(kIcebergFieldNameProp),
225+
std::string(sub_field.name()));
226+
(*node)->addCustomAttributesForField(attrs);
227+
}
228+
(*node)->addName(sanitized_name);
193229
(*node)->addLeaf(field_node);
194230
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
195231
}
@@ -839,7 +875,15 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig
839875
// Recursively apply field IDs to nested fields
840876
ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node,
841877
MakeAvroNodeWithFieldIds(field_node, *nested_field));
842-
new_record_node->addName(field_name);
878+
std::string sanitized_name = SanitizeFieldName(field_name);
879+
// Store original name as a custom attribute if it was modified
880+
if (sanitized_name != field_name) {
881+
// Add custom attribute to preserve the original field name
882+
::avro::CustomAttributes attrs;
883+
attrs.addAttribute(std::string(kIcebergFieldNameProp), field_name);
884+
new_record_node->addCustomAttributesForField(attrs);
885+
}
886+
new_record_node->addName(sanitized_name);
843887
new_record_node->addLeaf(new_nested_node);
844888
}
845889

0 commit comments

Comments
 (0)