Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions src/iceberg/avro/avro_schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {

} // namespace

bool validAvroName(const std::string& name) {
if (name.empty()) {
throw std::runtime_error("Empty name");
}

char first = name[0];
if (!(std::isalpha(first) || first == '_')) {
return false;
}

for (size_t i = 1; i < name.length(); i++) {
char character = name[i];
if (!(std::isalnum(character) || character == '_')) {
return false;
}
}
return true;
}

std::string SanitizeChar(char c) {
if (std::isdigit(c)) {
return std::string("_") + c;
}
std::stringstream ss;
ss << "_x" << std::uppercase << std::hex << static_cast<int>(c);
return ss.str();
}

std::string SanitizeFieldName(std::string_view field_name) {
std::string result;
result.reserve(field_name.size());

if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
result.append(SanitizeChar(field_name[0]));
} else {
result.push_back(field_name[0]);
}

for (size_t i = 1; i < field_name.size(); ++i) {
char c = field_name[i];
if (std::isalnum(c) || c == '_') {
result.push_back(c);
} else {
result.append(SanitizeChar(c));
}
}

return result;
}

std::string ToString(const ::avro::NodePtr& node) {
std::stringstream ss;
ss << *node;
Expand Down Expand Up @@ -181,8 +231,10 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
::avro::NodePtr field_node;
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));

// TODO(gangwu): sanitize field name
(*node)->addName(std::string(sub_field.name()));
bool isValidFieldName = validAvroName(std::string(sub_field.name()));
std::string fieldName = isValidFieldName ? std::string(sub_field.name())
: SanitizeFieldName(sub_field.name());
(*node)->addName(fieldName);
(*node)->addLeaf(field_node);
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
}
Expand Down
24 changes: 24 additions & 0 deletions src/iceberg/avro/avro_schema_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,28 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
const NameMapping& mapping);

/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
///
/// Converts names that are not valid Avro names to valid Avro names.
/// Conversion rules:
/// 1. If the first character is not a letter or underscore, it is specially handled:
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation
/// of the character (e.g., '$' -> '_x24')
/// 2. For characters other than the first:
/// - If it's a letter, digit, or underscore, it remains unchanged
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation
///
/// Examples:
/// - "123field" -> "_123field"
/// - "user-name" -> "user_x2Dname"
/// - "$price" -> "_x24price"
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
///
/// \param field_name The original field name to sanitize.
/// \return A sanitized field name that follows Avro naming conventions.
std::string SanitizeFieldName(std::string_view field_name);

} // namespace iceberg::avro
141 changes: 140 additions & 1 deletion test/avro_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

namespace iceberg::avro {

// Forward declaration of functions to test
bool validAvroName(const std::string& name);

namespace {

void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) {
Expand All @@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
}

// Helper function to check if a custom attribute exists for a field name preservation
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
const std::string& original_name) {
ASSERT_LT(index, node->customAttributes());
const auto& attrs = node->customAttributesAt(index);
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
}

} // namespace

TEST(ValidAvroNameTest, ValidNames) {
// Valid field names should return true
EXPECT_TRUE(validAvroName("valid_field"));
EXPECT_TRUE(validAvroName("field123"));
EXPECT_TRUE(validAvroName("_private"));
EXPECT_TRUE(validAvroName("CamelCase"));
EXPECT_TRUE(validAvroName("field_with_underscores"));
}

TEST(ValidAvroNameTest, InvalidNames) {
// Names starting with numbers should return false
EXPECT_FALSE(validAvroName("123field"));
EXPECT_FALSE(validAvroName("0value"));

// Names with special characters should return false
EXPECT_FALSE(validAvroName("field-name"));
EXPECT_FALSE(validAvroName("field.name"));
EXPECT_FALSE(validAvroName("field name"));
EXPECT_FALSE(validAvroName("field@name"));
EXPECT_FALSE(validAvroName("field#name"));
}

TEST(ValidAvroNameTest, EmptyName) {
// Empty name should throw an exception
EXPECT_THROW(validAvroName(""), std::runtime_error);
}

TEST(SanitizeFieldNameTest, ValidFieldNames) {
// Valid field names should remain unchanged
EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
EXPECT_EQ(SanitizeFieldName("field123"), "field123");
EXPECT_EQ(SanitizeFieldName("_private"), "_private");
EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores");
}

TEST(SanitizeFieldNameTest, InvalidFieldNames) {
// Field names starting with numbers should be prefixed with underscore
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");

// Field names with special characters should be encoded with hex values
EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");

// Complex field names with multiple issues
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
"_1field_x2Dwith_x2Especial_x40chars");
EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
}

TEST(SanitizeFieldNameTest, EdgeCases) {
// Empty field name
EXPECT_EQ(SanitizeFieldName(""), "_x0");

// Field name with only special characters
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");

// Field name starting with special character
EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
}

TEST(ToAvroNodeVisitorTest, BooleanType) {
::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
Expand Down Expand Up @@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
}

TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
// Test struct with field names that require sanitization
StructType struct_type{
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
/*optional=*/true},
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
/*optional=*/false},
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
/*optional=*/true}}};

::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

// Check that field names are sanitized
ASSERT_EQ(node->names(), 4);
EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
EXPECT_EQ(node->nameAt(1),
"email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
EXPECT_EQ(
node->nameAt(3),
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"

// Check that field IDs are correctly applied
// Each field has 1 custom attribute: field-id
ASSERT_EQ(node->customAttributes(), 4);
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
}

TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
// Test struct with field names that don't require sanitization
StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(),
/*optional=*/true}}};

::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

// Check that field names remain unchanged
ASSERT_EQ(node->names(), 2);
EXPECT_EQ(node->nameAt(0), "valid_field");
EXPECT_EQ(node->nameAt(1), "AnotherField");

// Check that field IDs are correctly applied
ASSERT_EQ(node->customAttributes(), 2);
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));

// For valid field names, there should be no iceberg-field-name attributes
const auto& attrs0 = node->customAttributesAt(0);
const auto& attrs1 = node->customAttributesAt(1);
EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value());
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
}

TEST(ToAvroNodeVisitorTest, ListType) {
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
/*optional=*/true}};
Expand Down Expand Up @@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
}

} // namespace iceberg::avro
Loading