diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index d411b4ecb..490b6ad69 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -56,8 +57,60 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) { return attributes; } +void SanitizeChar(char c, std::ostringstream& os) { + if (std::isdigit(c)) { + os << '_' << c; + } else { + os << "_x" << std::uppercase << std::hex << static_cast(c); + } +} + } // namespace +bool ValidAvroName(std::string_view name) { + if (name.empty()) { + return false; + } + + char first = name[0]; + if (!(std::isalpha(first) || first == '_')) { + return false; + } + + for (size_t i = 1; i < name.length(); i++) { + char character = name[i]; + if (!(std::isalnum(character) || character == '_')) { + return false; + } + } + return true; +} + +std::string SanitizeFieldName(std::string_view field_name) { + if (field_name.empty()) { + return ""; + } + + std::ostringstream result; + + if (!std::isalpha(field_name[0]) && field_name[0] != '_') { + SanitizeChar(field_name[0], result); + } else { + result << field_name[0]; + } + + for (size_t i = 1; i < field_name.size(); ++i) { + char c = field_name[i]; + if (std::isalnum(c) || c == '_') { + result << c; + } else { + SanitizeChar(c, result); + } + } + + return result.str(); +} + std::string ToString(const ::avro::NodePtr& node) { std::stringstream ss; ss << *node; @@ -181,10 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { ::avro::NodePtr field_node; ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node)); - // TODO(gangwu): sanitize field name - (*node)->addName(std::string(sub_field.name())); + bool is_valid_field_name = ValidAvroName(sub_field.name()); + std::string field_name = is_valid_field_name ? std::string(sub_field.name()) + : SanitizeFieldName(sub_field.name()); + + (*node)->addName(field_name); (*node)->addLeaf(field_node); - (*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id())); + + ::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id()); + if (!is_valid_field_name) { + attributes.addAttribute(std::string(kIcebergFieldNameProp), + std::string(sub_field.name()), + /*addQuotes=*/true); + } + (*node)->addCustomAttributesForField(attributes); } return {}; } diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 16478703a..bdfbf135a 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -149,6 +149,16 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type); /// \return True if the node has a map logical type, false otherwise. bool HasMapLogicalType(const ::avro::NodePtr& node); +/// \brief Check if a string is a valid Avro name. +/// +/// Valid Avro names must: +/// 1. Start with a letter or underscore +/// 2. Contain only letters, digits, or underscores +/// +/// \param name The name to check. +/// \return True if the name is valid, false otherwise. +bool ValidAvroName(std::string_view name); + /// \brief Create a new Avro node with field IDs from name mapping. /// \param original_node The original Avro node to copy. /// \param mapping The name mapping to apply field IDs from. @@ -156,4 +166,27 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, const NameMapping& mapping); +/// \brief Sanitize a field name to make it compatible with Avro field name requirements. +/// +/// Converts names that are not valid Avro names to valid Avro names. +/// Conversion rules: +/// 1. If the first character is not a letter or underscore, it is specially handled: +/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3') +/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal +/// representation of the character (e.g., '$' -> '_x24') +/// 2. For characters other than the first: +/// - If it's a letter, digit, or underscore, it remains unchanged +/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal +/// representation +/// +/// Examples: +/// - "123field" -> "_123field" +/// - "user-name" -> "user_x2Dname" +/// - "$price" -> "_x24price" +/// - "valid_name_123" -> "valid_name_123" (no conversion needed) +/// +/// \param field_name The original field name to sanitize. +/// \return A sanitized field name that follows Avro naming conventions. +std::string SanitizeFieldName(std::string_view field_name); + } // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 6fbbb851b..6870ee253 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -47,8 +47,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); } +// Helper function to check if a custom attribute exists for a field name preservation +void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index, + const std::string& original_name) { + ASSERT_LT(index, node->customAttributes()); + const auto& attrs = node->customAttributesAt(index); + ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name)); +} + } // namespace +TEST(ValidAvroNameTest, ValidNames) { + // Valid field names should return true + EXPECT_TRUE(ValidAvroName("valid_field")); + EXPECT_TRUE(ValidAvroName("field123")); + EXPECT_TRUE(ValidAvroName("_private")); + EXPECT_TRUE(ValidAvroName("CamelCase")); + EXPECT_TRUE(ValidAvroName("field_with_underscores")); +} + +TEST(ValidAvroNameTest, InvalidNames) { + // Names starting with numbers should return false + EXPECT_FALSE(ValidAvroName("123field")); + EXPECT_FALSE(ValidAvroName("0value")); + + // Names with special characters should return false + EXPECT_FALSE(ValidAvroName("field-name")); + EXPECT_FALSE(ValidAvroName("field.name")); + EXPECT_FALSE(ValidAvroName("field name")); + EXPECT_FALSE(ValidAvroName("field@name")); + EXPECT_FALSE(ValidAvroName("field#name")); +} + +TEST(ValidAvroNameTest, EmptyName) { + // Empty name should return false + EXPECT_FALSE(ValidAvroName("")); +} + +TEST(SanitizeFieldNameTest, ValidFieldNames) { + // Valid field names should remain unchanged + EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field"); + EXPECT_EQ(SanitizeFieldName("field123"), "field123"); + EXPECT_EQ(SanitizeFieldName("_private"), "_private"); + EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase"); + EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores"); +} + +TEST(SanitizeFieldNameTest, InvalidFieldNames) { + // Field names starting with numbers should be prefixed with underscore + EXPECT_EQ(SanitizeFieldName("123field"), "_123field"); + EXPECT_EQ(SanitizeFieldName("0value"), "_0value"); + + // Field names with special characters should be encoded with hex values + EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname"); + EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename"); + EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name"); + EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name"); + EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name"); + + // Complex field names with multiple issues + EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"), + "_1field_x2Dwith_x2Especial_x40chars"); + EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail"); +} + +TEST(SanitizeFieldNameTest, EdgeCases) { + // Empty field name + EXPECT_EQ(SanitizeFieldName(""), ""); + + // Field name with only special characters + EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24"); + + // Field name starting with special character + EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield"); + EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield"); +} + TEST(ToAvroNodeVisitorTest, BooleanType) { ::avro::NodePtr node; EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk()); @@ -181,6 +255,60 @@ TEST(ToAvroNodeVisitorTest, StructType) { EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT); } +TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) { + StructType struct_type{ + {SchemaField{/*field_id=*/1, "user-name", iceberg::string(), + /*optional=*/false}, + SchemaField{/*field_id=*/2, "valid_field", iceberg::string(), + /*optional=*/false}, + SchemaField{/*field_id=*/3, "email.address", iceberg::string(), + /*optional=*/true}, + SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(), + /*optional=*/true}, + SchemaField{/*field_id=*/5, "123field", iceberg::int32(), + /*optional=*/false}, + SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(), + /*optional=*/true}}}; + ::avro::NodePtr node; + EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk()); + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + + ASSERT_EQ(node->names(), 6); + + EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname" + EXPECT_EQ(node->nameAt(2), + "email_x2Eaddress"); // "email.address" -> "email_x2Eaddress" + EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field" + EXPECT_EQ( + node->nameAt(5), + "field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces" + + EXPECT_EQ(node->nameAt(1), "valid_field"); + EXPECT_EQ(node->nameAt(3), "AnotherField"); + + ASSERT_EQ(node->customAttributes(), 6); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6)); + + const auto& attrs1 = node->customAttributesAt(1); // valid_field + const auto& attrs3 = node->customAttributesAt(3); // AnotherField + EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value()); + EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value()); + + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/2, /*original_name=*/"email.address")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with spaces")); +} + TEST(ToAvroNodeVisitorTest, ListType) { ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(), /*optional=*/true}}; @@ -1436,5 +1564,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); } - } // namespace iceberg::avro