Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 66 additions & 3 deletions src/iceberg/avro/avro_schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <charconv>
#include <format>
#include <mutex>
#include <sstream>
#include <string_view>

#include <arrow/type.h>
Expand Down Expand Up @@ -56,8 +57,60 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
return attributes;
}

void SanitizeChar(char c, std::ostringstream& os) {
if (std::isdigit(c)) {
os << '_' << c;
} else {
os << "_x" << std::uppercase << std::hex << static_cast<int>(c);
}
}

} // namespace

bool ValidAvroName(std::string_view name) {
if (name.empty()) {
return false;
}

char first = name[0];
if (!(std::isalpha(first) || first == '_')) {
return false;
}

for (size_t i = 1; i < name.length(); i++) {
char character = name[i];
if (!(std::isalnum(character) || character == '_')) {
return false;
}
}
return true;
}

std::string SanitizeFieldName(std::string_view field_name) {
if (field_name.empty()) {
return "";
}

std::ostringstream result;

if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
SanitizeChar(field_name[0], result);
} else {
result << field_name[0];
}

for (size_t i = 1; i < field_name.size(); ++i) {
char c = field_name[i];
if (std::isalnum(c) || c == '_') {
result << c;
} else {
SanitizeChar(c, result);
}
}

return result.str();
}

std::string ToString(const ::avro::NodePtr& node) {
std::stringstream ss;
ss << *node;
Expand Down Expand Up @@ -181,10 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
::avro::NodePtr field_node;
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));

// TODO(gangwu): sanitize field name
(*node)->addName(std::string(sub_field.name()));
bool is_valid_field_name = ValidAvroName(sub_field.name());
std::string field_name = is_valid_field_name ? std::string(sub_field.name())
: SanitizeFieldName(sub_field.name());

(*node)->addName(field_name);
(*node)->addLeaf(field_node);
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));

::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id());
if (!is_valid_field_name) {
attributes.addAttribute(std::string(kIcebergFieldNameProp),
std::string(sub_field.name()),
/*addQuotes=*/true);
}
(*node)->addCustomAttributesForField(attributes);
}
return {};
}
Expand Down
33 changes: 33 additions & 0 deletions src/iceberg/avro/avro_schema_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,44 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type);
/// \return True if the node has a map logical type, false otherwise.
bool HasMapLogicalType(const ::avro::NodePtr& node);

/// \brief Check if a string is a valid Avro name.
///
/// Valid Avro names must:
/// 1. Start with a letter or underscore
/// 2. Contain only letters, digits, or underscores
///
/// \param name The name to check.
/// \return True if the name is valid, false otherwise.
bool ValidAvroName(std::string_view name);

/// \brief Create a new Avro node with field IDs from name mapping.
/// \param original_node The original Avro node to copy.
/// \param mapping The name mapping to apply field IDs from.
/// \return A new Avro node with field IDs applied, or an error.
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
const NameMapping& mapping);

/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
///
/// Converts names that are not valid Avro names to valid Avro names.
/// Conversion rules:
/// 1. If the first character is not a letter or underscore, it is specially handled:
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation of the character (e.g., '$' -> '_x24')
/// 2. For characters other than the first:
/// - If it's a letter, digit, or underscore, it remains unchanged
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation
///
/// Examples:
/// - "123field" -> "_123field"
/// - "user-name" -> "user_x2Dname"
/// - "$price" -> "_x24price"
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
///
/// \param field_name The original field name to sanitize.
/// \return A sanitized field name that follows Avro naming conventions.
std::string SanitizeFieldName(std::string_view field_name);

} // namespace iceberg::avro
129 changes: 128 additions & 1 deletion test/avro_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
}

// Helper function to check if a custom attribute exists for a field name preservation
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
const std::string& original_name) {
ASSERT_LT(index, node->customAttributes());
const auto& attrs = node->customAttributesAt(index);
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
}

} // namespace

TEST(ValidAvroNameTest, ValidNames) {
// Valid field names should return true
EXPECT_TRUE(ValidAvroName("valid_field"));
EXPECT_TRUE(ValidAvroName("field123"));
EXPECT_TRUE(ValidAvroName("_private"));
EXPECT_TRUE(ValidAvroName("CamelCase"));
EXPECT_TRUE(ValidAvroName("field_with_underscores"));
}

TEST(ValidAvroNameTest, InvalidNames) {
// Names starting with numbers should return false
EXPECT_FALSE(ValidAvroName("123field"));
EXPECT_FALSE(ValidAvroName("0value"));

// Names with special characters should return false
EXPECT_FALSE(ValidAvroName("field-name"));
EXPECT_FALSE(ValidAvroName("field.name"));
EXPECT_FALSE(ValidAvroName("field name"));
EXPECT_FALSE(ValidAvroName("field@name"));
EXPECT_FALSE(ValidAvroName("field#name"));
}

TEST(ValidAvroNameTest, EmptyName) {
// Empty name should return false
EXPECT_FALSE(ValidAvroName(""));
}

TEST(SanitizeFieldNameTest, ValidFieldNames) {
// Valid field names should remain unchanged
EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
EXPECT_EQ(SanitizeFieldName("field123"), "field123");
EXPECT_EQ(SanitizeFieldName("_private"), "_private");
EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores");
}

TEST(SanitizeFieldNameTest, InvalidFieldNames) {
// Field names starting with numbers should be prefixed with underscore
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");

// Field names with special characters should be encoded with hex values
EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");

// Complex field names with multiple issues
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
"_1field_x2Dwith_x2Especial_x40chars");
EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
}

TEST(SanitizeFieldNameTest, EdgeCases) {
// Empty field name
EXPECT_EQ(SanitizeFieldName(""), "");

// Field name with only special characters
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");

// Field name starting with special character
EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
}

TEST(ToAvroNodeVisitorTest, BooleanType) {
::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
Expand Down Expand Up @@ -181,6 +255,60 @@ TEST(ToAvroNodeVisitorTest, StructType) {
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
}

TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) {
StructType struct_type{
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/2, "valid_field", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/3, "email.address", iceberg::string(),
/*optional=*/true},
SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(),
/*optional=*/true},
SchemaField{/*field_id=*/5, "123field", iceberg::int32(),
/*optional=*/false},
SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(),
/*optional=*/true}}};
::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

ASSERT_EQ(node->names(), 6);

EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
EXPECT_EQ(node->nameAt(2),
"email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field"
EXPECT_EQ(
node->nameAt(5),
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"

EXPECT_EQ(node->nameAt(1), "valid_field");
EXPECT_EQ(node->nameAt(3), "AnotherField");

ASSERT_EQ(node->customAttributes(), 6);
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6));

const auto& attrs1 = node->customAttributesAt(1); // valid_field
const auto& attrs3 = node->customAttributesAt(3); // AnotherField
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value());

ASSERT_NO_FATAL_FAILURE(
CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name"));
ASSERT_NO_FATAL_FAILURE(
CheckIcebergFieldName(node, /*index=*/2, /*original_name=*/"email.address"));
ASSERT_NO_FATAL_FAILURE(
CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field"));
ASSERT_NO_FATAL_FAILURE(
CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with spaces"));
}

TEST(ToAvroNodeVisitorTest, ListType) {
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
/*optional=*/true}};
Expand Down Expand Up @@ -1436,5 +1564,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
}

} // namespace iceberg::avro
Loading