From 51f205ce8184ea8c50947da3f268d75e2542897b Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 22 Aug 2025 14:42:30 +0000 Subject: [PATCH 1/3] feat: avro schema add sanitize field name --- src/iceberg/avro/avro_schema_util.cc | 56 +++++++- src/iceberg/avro/avro_schema_util_internal.h | 24 ++++ test/avro_schema_test.cc | 141 ++++++++++++++++++- 3 files changed, 218 insertions(+), 3 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index d411b4ecb..e48a7f32a 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) { } // namespace +bool validAvroName(const std::string& name) { + if (name.empty()) { + throw std::runtime_error("Empty name"); + } + + char first = name[0]; + if (!(std::isalpha(first) || first == '_')) { + return false; + } + + for (size_t i = 1; i < name.length(); i++) { + char character = name[i]; + if (!(std::isalnum(character) || character == '_')) { + return false; + } + } + return true; +} + +std::string SanitizeChar(char c) { + if (std::isdigit(c)) { + return std::string("_") + c; + } + std::stringstream ss; + ss << "_x" << std::uppercase << std::hex << static_cast(c); + return ss.str(); +} + +std::string SanitizeFieldName(std::string_view field_name) { + std::string result; + result.reserve(field_name.size()); + + if (!std::isalpha(field_name[0]) && field_name[0] != '_') { + result.append(SanitizeChar(field_name[0])); + } else { + result.push_back(field_name[0]); + } + + for (size_t i = 1; i < field_name.size(); ++i) { + char c = field_name[i]; + if (std::isalnum(c) || c == '_') { + result.push_back(c); + } else { + result.append(SanitizeChar(c)); + } + } + + return result; +} + std::string ToString(const ::avro::NodePtr& node) { std::stringstream ss; ss << *node; @@ -181,8 +231,10 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { ::avro::NodePtr field_node; ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node)); - // TODO(gangwu): sanitize field name - (*node)->addName(std::string(sub_field.name())); + bool isValidFieldName = validAvroName(std::string(sub_field.name())); + std::string fieldName = isValidFieldName ? std::string(sub_field.name()) + : SanitizeFieldName(sub_field.name()); + (*node)->addName(fieldName); (*node)->addLeaf(field_node); (*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id())); } diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 16478703a..08db3c7c5 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -156,4 +156,28 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, const NameMapping& mapping); +/// \brief Sanitize a field name to make it compatible with Avro field name requirements. +/// +/// Converts names that are not valid Avro names to valid Avro names. +/// Conversion rules: +/// 1. If the first character is not a letter or underscore, it is specially handled: +/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3') +/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal +/// representation +/// of the character (e.g., '$' -> '_x24') +/// 2. For characters other than the first: +/// - If it's a letter, digit, or underscore, it remains unchanged +/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal +/// representation +/// +/// Examples: +/// - "123field" -> "_123field" +/// - "user-name" -> "user_x2Dname" +/// - "$price" -> "_x24price" +/// - "valid_name_123" -> "valid_name_123" (no conversion needed) +/// +/// \param field_name The original field name to sanitize. +/// \return A sanitized field name that follows Avro naming conventions. +std::string SanitizeFieldName(std::string_view field_name); + } // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 6fbbb851b..dc5cada0c 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -32,6 +32,9 @@ namespace iceberg::avro { +// Forward declaration of functions to test +bool validAvroName(const std::string& name); + namespace { void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) { @@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); } +// Helper function to check if a custom attribute exists for a field name preservation +void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index, + const std::string& original_name) { + ASSERT_LT(index, node->customAttributes()); + const auto& attrs = node->customAttributesAt(index); + ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name)); +} + } // namespace +TEST(ValidAvroNameTest, ValidNames) { + // Valid field names should return true + EXPECT_TRUE(validAvroName("valid_field")); + EXPECT_TRUE(validAvroName("field123")); + EXPECT_TRUE(validAvroName("_private")); + EXPECT_TRUE(validAvroName("CamelCase")); + EXPECT_TRUE(validAvroName("field_with_underscores")); +} + +TEST(ValidAvroNameTest, InvalidNames) { + // Names starting with numbers should return false + EXPECT_FALSE(validAvroName("123field")); + EXPECT_FALSE(validAvroName("0value")); + + // Names with special characters should return false + EXPECT_FALSE(validAvroName("field-name")); + EXPECT_FALSE(validAvroName("field.name")); + EXPECT_FALSE(validAvroName("field name")); + EXPECT_FALSE(validAvroName("field@name")); + EXPECT_FALSE(validAvroName("field#name")); +} + +TEST(ValidAvroNameTest, EmptyName) { + // Empty name should throw an exception + EXPECT_THROW(validAvroName(""), std::runtime_error); +} + +TEST(SanitizeFieldNameTest, ValidFieldNames) { + // Valid field names should remain unchanged + EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field"); + EXPECT_EQ(SanitizeFieldName("field123"), "field123"); + EXPECT_EQ(SanitizeFieldName("_private"), "_private"); + EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase"); + EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores"); +} + +TEST(SanitizeFieldNameTest, InvalidFieldNames) { + // Field names starting with numbers should be prefixed with underscore + EXPECT_EQ(SanitizeFieldName("123field"), "_123field"); + EXPECT_EQ(SanitizeFieldName("0value"), "_0value"); + + // Field names with special characters should be encoded with hex values + EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname"); + EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename"); + EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name"); + EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name"); + EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name"); + + // Complex field names with multiple issues + EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"), + "_1field_x2Dwith_x2Especial_x40chars"); + EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail"); +} + +TEST(SanitizeFieldNameTest, EdgeCases) { + // Empty field name + EXPECT_EQ(SanitizeFieldName(""), "_x0"); + + // Field name with only special characters + EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24"); + + // Field name starting with special character + EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield"); + EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield"); +} + TEST(ToAvroNodeVisitorTest, BooleanType) { ::avro::NodePtr node; EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk()); @@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) { EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT); } +TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) { + // Test struct with field names that require sanitization + StructType struct_type{ + {SchemaField{/*field_id=*/1, "user-name", iceberg::string(), + /*optional=*/false}, + SchemaField{/*field_id=*/2, "email.address", iceberg::string(), + /*optional=*/true}, + SchemaField{/*field_id=*/3, "123field", iceberg::int32(), + /*optional=*/false}, + SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(), + /*optional=*/true}}}; + + ::avro::NodePtr node; + EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk()); + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + + // Check that field names are sanitized + ASSERT_EQ(node->names(), 4); + EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname" + EXPECT_EQ(node->nameAt(1), + "email_x2Eaddress"); // "email.address" -> "email_x2Eaddress" + EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field" + EXPECT_EQ( + node->nameAt(3), + "field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces" + + // Check that field IDs are correctly applied + // Each field has 1 custom attribute: field-id + ASSERT_EQ(node->customAttributes(), 4); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4)); +} + +TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) { + // Test struct with field names that don't require sanitization + StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(), + /*optional=*/false}, + SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(), + /*optional=*/true}}}; + + ::avro::NodePtr node; + EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk()); + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + + // Check that field names remain unchanged + ASSERT_EQ(node->names(), 2); + EXPECT_EQ(node->nameAt(0), "valid_field"); + EXPECT_EQ(node->nameAt(1), "AnotherField"); + + // Check that field IDs are correctly applied + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + + // For valid field names, there should be no iceberg-field-name attributes + const auto& attrs0 = node->customAttributesAt(0); + const auto& attrs1 = node->customAttributesAt(1); + EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value()); + EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value()); +} + TEST(ToAvroNodeVisitorTest, ListType) { ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(), /*optional=*/true}}; @@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); } - } // namespace iceberg::avro From 9288756f8507fdc4b5db1a3c90a27d61e05cd8b7 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Mon, 8 Sep 2025 22:15:24 +0800 Subject: [PATCH 2/3] update code --- src/iceberg/avro/avro_schema_util.cc | 53 ++++++---- src/iceberg/avro/avro_schema_util_internal.h | 13 ++- test/avro_schema_test.cc | 104 ++++++++----------- 3 files changed, 89 insertions(+), 81 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index e48a7f32a..87ecc2d4e 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -56,11 +57,19 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) { return attributes; } +void SanitizeChar(char c, std::ostringstream& os) { + if (std::isdigit(c)) { + os << '_' << c; + } else { + os << "_x" << std::uppercase << std::hex << static_cast(c); + } +} + } // namespace -bool validAvroName(const std::string& name) { +bool ValidAvroName(const std::string& name) { if (name.empty()) { - throw std::runtime_error("Empty name"); + return false; } char first = name[0]; @@ -77,35 +86,29 @@ bool validAvroName(const std::string& name) { return true; } -std::string SanitizeChar(char c) { - if (std::isdigit(c)) { - return std::string("_") + c; +std::string SanitizeFieldName(std::string_view field_name) { + if (field_name.empty()) { + return ""; } - std::stringstream ss; - ss << "_x" << std::uppercase << std::hex << static_cast(c); - return ss.str(); -} -std::string SanitizeFieldName(std::string_view field_name) { - std::string result; - result.reserve(field_name.size()); + std::ostringstream result; if (!std::isalpha(field_name[0]) && field_name[0] != '_') { - result.append(SanitizeChar(field_name[0])); + SanitizeChar(field_name[0], result); } else { - result.push_back(field_name[0]); + result << field_name[0]; } for (size_t i = 1; i < field_name.size(); ++i) { char c = field_name[i]; if (std::isalnum(c) || c == '_') { - result.push_back(c); + result << c; } else { - result.append(SanitizeChar(c)); + SanitizeChar(c, result); } } - return result; + return result.str(); } std::string ToString(const ::avro::NodePtr& node) { @@ -231,12 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { ::avro::NodePtr field_node; ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node)); - bool isValidFieldName = validAvroName(std::string(sub_field.name())); - std::string fieldName = isValidFieldName ? std::string(sub_field.name()) - : SanitizeFieldName(sub_field.name()); + std::string origFieldName = std::string(sub_field.name()); + bool isValidFieldName = ValidAvroName(origFieldName); + std::string fieldName = + isValidFieldName ? origFieldName : SanitizeFieldName(origFieldName); + (*node)->addName(fieldName); (*node)->addLeaf(field_node); - (*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id())); + + ::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id()); + if (!isValidFieldName) { + attributes.addAttribute(std::string(kIcebergFieldNameProp), origFieldName, + /*addQuotes=*/true); + } + (*node)->addCustomAttributesForField(attributes); } return {}; } diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 08db3c7c5..a4a59873c 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -149,6 +149,16 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type); /// \return True if the node has a map logical type, false otherwise. bool HasMapLogicalType(const ::avro::NodePtr& node); +/// \brief Check if a string is a valid Avro name. +/// +/// Valid Avro names must: +/// 1. Start with a letter or underscore +/// 2. Contain only letters, digits, or underscores +/// +/// \param name The name to check. +/// \return True if the name is valid, false otherwise. +bool ValidAvroName(const std::string& name); + /// \brief Create a new Avro node with field IDs from name mapping. /// \param original_node The original Avro node to copy. /// \param mapping The name mapping to apply field IDs from. @@ -163,8 +173,7 @@ Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original /// 1. If the first character is not a letter or underscore, it is specially handled: /// - Digits: Prefixed with an underscore (e.g., '3' -> '_3') /// - Other characters: Converted to '_x' followed by the uppercase hexadecimal -/// representation -/// of the character (e.g., '$' -> '_x24') +/// representation of the character (e.g., '$' -> '_x24') /// 2. For characters other than the first: /// - If it's a letter, digit, or underscore, it remains unchanged /// - Other characters: Converted to '_x' followed by the uppercase hexadecimal diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index dc5cada0c..6870ee253 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -32,9 +32,6 @@ namespace iceberg::avro { -// Forward declaration of functions to test -bool validAvroName(const std::string& name); - namespace { void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) { @@ -62,29 +59,29 @@ void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index, TEST(ValidAvroNameTest, ValidNames) { // Valid field names should return true - EXPECT_TRUE(validAvroName("valid_field")); - EXPECT_TRUE(validAvroName("field123")); - EXPECT_TRUE(validAvroName("_private")); - EXPECT_TRUE(validAvroName("CamelCase")); - EXPECT_TRUE(validAvroName("field_with_underscores")); + EXPECT_TRUE(ValidAvroName("valid_field")); + EXPECT_TRUE(ValidAvroName("field123")); + EXPECT_TRUE(ValidAvroName("_private")); + EXPECT_TRUE(ValidAvroName("CamelCase")); + EXPECT_TRUE(ValidAvroName("field_with_underscores")); } TEST(ValidAvroNameTest, InvalidNames) { // Names starting with numbers should return false - EXPECT_FALSE(validAvroName("123field")); - EXPECT_FALSE(validAvroName("0value")); + EXPECT_FALSE(ValidAvroName("123field")); + EXPECT_FALSE(ValidAvroName("0value")); // Names with special characters should return false - EXPECT_FALSE(validAvroName("field-name")); - EXPECT_FALSE(validAvroName("field.name")); - EXPECT_FALSE(validAvroName("field name")); - EXPECT_FALSE(validAvroName("field@name")); - EXPECT_FALSE(validAvroName("field#name")); + EXPECT_FALSE(ValidAvroName("field-name")); + EXPECT_FALSE(ValidAvroName("field.name")); + EXPECT_FALSE(ValidAvroName("field name")); + EXPECT_FALSE(ValidAvroName("field@name")); + EXPECT_FALSE(ValidAvroName("field#name")); } TEST(ValidAvroNameTest, EmptyName) { - // Empty name should throw an exception - EXPECT_THROW(validAvroName(""), std::runtime_error); + // Empty name should return false + EXPECT_FALSE(ValidAvroName("")); } TEST(SanitizeFieldNameTest, ValidFieldNames) { @@ -116,7 +113,7 @@ TEST(SanitizeFieldNameTest, InvalidFieldNames) { TEST(SanitizeFieldNameTest, EdgeCases) { // Empty field name - EXPECT_EQ(SanitizeFieldName(""), "_x0"); + EXPECT_EQ(SanitizeFieldName(""), ""); // Field name with only special characters EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24"); @@ -258,67 +255,58 @@ TEST(ToAvroNodeVisitorTest, StructType) { EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT); } -TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) { - // Test struct with field names that require sanitization +TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) { StructType struct_type{ {SchemaField{/*field_id=*/1, "user-name", iceberg::string(), /*optional=*/false}, - SchemaField{/*field_id=*/2, "email.address", iceberg::string(), + SchemaField{/*field_id=*/2, "valid_field", iceberg::string(), + /*optional=*/false}, + SchemaField{/*field_id=*/3, "email.address", iceberg::string(), + /*optional=*/true}, + SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(), /*optional=*/true}, - SchemaField{/*field_id=*/3, "123field", iceberg::int32(), + SchemaField{/*field_id=*/5, "123field", iceberg::int32(), /*optional=*/false}, - SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(), + SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(), /*optional=*/true}}}; - ::avro::NodePtr node; EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk()); EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); - // Check that field names are sanitized - ASSERT_EQ(node->names(), 4); + ASSERT_EQ(node->names(), 6); + EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname" - EXPECT_EQ(node->nameAt(1), + EXPECT_EQ(node->nameAt(2), "email_x2Eaddress"); // "email.address" -> "email_x2Eaddress" - EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field" + EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field" EXPECT_EQ( - node->nameAt(3), + node->nameAt(5), "field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces" - // Check that field IDs are correctly applied - // Each field has 1 custom attribute: field-id - ASSERT_EQ(node->customAttributes(), 4); + EXPECT_EQ(node->nameAt(1), "valid_field"); + EXPECT_EQ(node->nameAt(3), "AnotherField"); + + ASSERT_EQ(node->customAttributes(), 6); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4)); -} - -TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) { - // Test struct with field names that don't require sanitization - StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(), - /*optional=*/false}, - SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(), - /*optional=*/true}}}; - - ::avro::NodePtr node; - EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk()); - EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); - - // Check that field names remain unchanged - ASSERT_EQ(node->names(), 2); - EXPECT_EQ(node->nameAt(0), "valid_field"); - EXPECT_EQ(node->nameAt(1), "AnotherField"); - - // Check that field IDs are correctly applied - ASSERT_EQ(node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6)); - // For valid field names, there should be no iceberg-field-name attributes - const auto& attrs0 = node->customAttributesAt(0); - const auto& attrs1 = node->customAttributesAt(1); - EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value()); + const auto& attrs1 = node->customAttributesAt(1); // valid_field + const auto& attrs3 = node->customAttributesAt(3); // AnotherField EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value()); + EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value()); + + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/2, /*original_name=*/"email.address")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field")); + ASSERT_NO_FATAL_FAILURE( + CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with spaces")); } TEST(ToAvroNodeVisitorTest, ListType) { From 03ac83e5be51a1eedbdb7bdde53b4cecff5e359c Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Tue, 16 Sep 2025 22:25:22 +0800 Subject: [PATCH 3/3] update code --- src/iceberg/avro/avro_schema_util.cc | 16 ++++++++-------- src/iceberg/avro/avro_schema_util_internal.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 87ecc2d4e..490b6ad69 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -67,7 +67,7 @@ void SanitizeChar(char c, std::ostringstream& os) { } // namespace -bool ValidAvroName(const std::string& name) { +bool ValidAvroName(std::string_view name) { if (name.empty()) { return false; } @@ -234,17 +234,17 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { ::avro::NodePtr field_node; ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node)); - std::string origFieldName = std::string(sub_field.name()); - bool isValidFieldName = ValidAvroName(origFieldName); - std::string fieldName = - isValidFieldName ? origFieldName : SanitizeFieldName(origFieldName); + bool is_valid_field_name = ValidAvroName(sub_field.name()); + std::string field_name = is_valid_field_name ? std::string(sub_field.name()) + : SanitizeFieldName(sub_field.name()); - (*node)->addName(fieldName); + (*node)->addName(field_name); (*node)->addLeaf(field_node); ::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id()); - if (!isValidFieldName) { - attributes.addAttribute(std::string(kIcebergFieldNameProp), origFieldName, + if (!is_valid_field_name) { + attributes.addAttribute(std::string(kIcebergFieldNameProp), + std::string(sub_field.name()), /*addQuotes=*/true); } (*node)->addCustomAttributesForField(attributes); diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index a4a59873c..bdfbf135a 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -157,7 +157,7 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); /// /// \param name The name to check. /// \return True if the name is valid, false otherwise. -bool ValidAvroName(const std::string& name); +bool ValidAvroName(std::string_view name); /// \brief Create a new Avro node with field IDs from name mapping. /// \param original_node The original Avro node to copy.