Skip to content

Commit 9288756

Browse files
update code
1 parent 51f205c commit 9288756

File tree

3 files changed

+89
-81
lines changed

3 files changed

+89
-81
lines changed

src/iceberg/avro/avro_schema_util.cc

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <charconv>
2121
#include <format>
2222
#include <mutex>
23+
#include <sstream>
2324
#include <string_view>
2425

2526
#include <arrow/type.h>
@@ -56,11 +57,19 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
5657
return attributes;
5758
}
5859

60+
void SanitizeChar(char c, std::ostringstream& os) {
61+
if (std::isdigit(c)) {
62+
os << '_' << c;
63+
} else {
64+
os << "_x" << std::uppercase << std::hex << static_cast<int>(c);
65+
}
66+
}
67+
5968
} // namespace
6069

61-
bool validAvroName(const std::string& name) {
70+
bool ValidAvroName(const std::string& name) {
6271
if (name.empty()) {
63-
throw std::runtime_error("Empty name");
72+
return false;
6473
}
6574

6675
char first = name[0];
@@ -77,35 +86,29 @@ bool validAvroName(const std::string& name) {
7786
return true;
7887
}
7988

80-
std::string SanitizeChar(char c) {
81-
if (std::isdigit(c)) {
82-
return std::string("_") + c;
89+
std::string SanitizeFieldName(std::string_view field_name) {
90+
if (field_name.empty()) {
91+
return "";
8392
}
84-
std::stringstream ss;
85-
ss << "_x" << std::uppercase << std::hex << static_cast<int>(c);
86-
return ss.str();
87-
}
8893

89-
std::string SanitizeFieldName(std::string_view field_name) {
90-
std::string result;
91-
result.reserve(field_name.size());
94+
std::ostringstream result;
9295

9396
if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
94-
result.append(SanitizeChar(field_name[0]));
97+
SanitizeChar(field_name[0], result);
9598
} else {
96-
result.push_back(field_name[0]);
99+
result << field_name[0];
97100
}
98101

99102
for (size_t i = 1; i < field_name.size(); ++i) {
100103
char c = field_name[i];
101104
if (std::isalnum(c) || c == '_') {
102-
result.push_back(c);
105+
result << c;
103106
} else {
104-
result.append(SanitizeChar(c));
107+
SanitizeChar(c, result);
105108
}
106109
}
107110

108-
return result;
111+
return result.str();
109112
}
110113

111114
std::string ToString(const ::avro::NodePtr& node) {
@@ -231,12 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
231234
::avro::NodePtr field_node;
232235
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
233236

234-
bool isValidFieldName = validAvroName(std::string(sub_field.name()));
235-
std::string fieldName = isValidFieldName ? std::string(sub_field.name())
236-
: SanitizeFieldName(sub_field.name());
237+
std::string origFieldName = std::string(sub_field.name());
238+
bool isValidFieldName = ValidAvroName(origFieldName);
239+
std::string fieldName =
240+
isValidFieldName ? origFieldName : SanitizeFieldName(origFieldName);
241+
237242
(*node)->addName(fieldName);
238243
(*node)->addLeaf(field_node);
239-
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
244+
245+
::avro::CustomAttributes attributes = GetAttributesWithFieldId(sub_field.field_id());
246+
if (!isValidFieldName) {
247+
attributes.addAttribute(std::string(kIcebergFieldNameProp), origFieldName,
248+
/*addQuotes=*/true);
249+
}
250+
(*node)->addCustomAttributesForField(attributes);
240251
}
241252
return {};
242253
}

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,16 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type);
149149
/// \return True if the node has a map logical type, false otherwise.
150150
bool HasMapLogicalType(const ::avro::NodePtr& node);
151151

152+
/// \brief Check if a string is a valid Avro name.
153+
///
154+
/// Valid Avro names must:
155+
/// 1. Start with a letter or underscore
156+
/// 2. Contain only letters, digits, or underscores
157+
///
158+
/// \param name The name to check.
159+
/// \return True if the name is valid, false otherwise.
160+
bool ValidAvroName(const std::string& name);
161+
152162
/// \brief Create a new Avro node with field IDs from name mapping.
153163
/// \param original_node The original Avro node to copy.
154164
/// \param mapping The name mapping to apply field IDs from.
@@ -163,8 +173,7 @@ Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original
163173
/// 1. If the first character is not a letter or underscore, it is specially handled:
164174
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
165175
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
166-
/// representation
167-
/// of the character (e.g., '$' -> '_x24')
176+
/// representation of the character (e.g., '$' -> '_x24')
168177
/// 2. For characters other than the first:
169178
/// - If it's a letter, digit, or underscore, it remains unchanged
170179
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal

test/avro_schema_test.cc

Lines changed: 46 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@
3232

3333
namespace iceberg::avro {
3434

35-
// Forward declaration of functions to test
36-
bool validAvroName(const std::string& name);
37-
3835
namespace {
3936

4037
void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) {
@@ -62,29 +59,29 @@ void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
6259

6360
TEST(ValidAvroNameTest, ValidNames) {
6461
// Valid field names should return true
65-
EXPECT_TRUE(validAvroName("valid_field"));
66-
EXPECT_TRUE(validAvroName("field123"));
67-
EXPECT_TRUE(validAvroName("_private"));
68-
EXPECT_TRUE(validAvroName("CamelCase"));
69-
EXPECT_TRUE(validAvroName("field_with_underscores"));
62+
EXPECT_TRUE(ValidAvroName("valid_field"));
63+
EXPECT_TRUE(ValidAvroName("field123"));
64+
EXPECT_TRUE(ValidAvroName("_private"));
65+
EXPECT_TRUE(ValidAvroName("CamelCase"));
66+
EXPECT_TRUE(ValidAvroName("field_with_underscores"));
7067
}
7168

7269
TEST(ValidAvroNameTest, InvalidNames) {
7370
// Names starting with numbers should return false
74-
EXPECT_FALSE(validAvroName("123field"));
75-
EXPECT_FALSE(validAvroName("0value"));
71+
EXPECT_FALSE(ValidAvroName("123field"));
72+
EXPECT_FALSE(ValidAvroName("0value"));
7673

7774
// Names with special characters should return false
78-
EXPECT_FALSE(validAvroName("field-name"));
79-
EXPECT_FALSE(validAvroName("field.name"));
80-
EXPECT_FALSE(validAvroName("field name"));
81-
EXPECT_FALSE(validAvroName("field@name"));
82-
EXPECT_FALSE(validAvroName("field#name"));
75+
EXPECT_FALSE(ValidAvroName("field-name"));
76+
EXPECT_FALSE(ValidAvroName("field.name"));
77+
EXPECT_FALSE(ValidAvroName("field name"));
78+
EXPECT_FALSE(ValidAvroName("field@name"));
79+
EXPECT_FALSE(ValidAvroName("field#name"));
8380
}
8481

8582
TEST(ValidAvroNameTest, EmptyName) {
86-
// Empty name should throw an exception
87-
EXPECT_THROW(validAvroName(""), std::runtime_error);
83+
// Empty name should return false
84+
EXPECT_FALSE(ValidAvroName(""));
8885
}
8986

9087
TEST(SanitizeFieldNameTest, ValidFieldNames) {
@@ -116,7 +113,7 @@ TEST(SanitizeFieldNameTest, InvalidFieldNames) {
116113

117114
TEST(SanitizeFieldNameTest, EdgeCases) {
118115
// Empty field name
119-
EXPECT_EQ(SanitizeFieldName(""), "_x0");
116+
EXPECT_EQ(SanitizeFieldName(""), "");
120117

121118
// Field name with only special characters
122119
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");
@@ -258,67 +255,58 @@ TEST(ToAvroNodeVisitorTest, StructType) {
258255
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
259256
}
260257

261-
TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
262-
// Test struct with field names that require sanitization
258+
TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) {
263259
StructType struct_type{
264260
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
265261
/*optional=*/false},
266-
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
262+
SchemaField{/*field_id=*/2, "valid_field", iceberg::string(),
263+
/*optional=*/false},
264+
SchemaField{/*field_id=*/3, "email.address", iceberg::string(),
265+
/*optional=*/true},
266+
SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(),
267267
/*optional=*/true},
268-
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
268+
SchemaField{/*field_id=*/5, "123field", iceberg::int32(),
269269
/*optional=*/false},
270-
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
270+
SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(),
271271
/*optional=*/true}}};
272-
273272
::avro::NodePtr node;
274273
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
275274
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
276275

277-
// Check that field names are sanitized
278-
ASSERT_EQ(node->names(), 4);
276+
ASSERT_EQ(node->names(), 6);
277+
279278
EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
280-
EXPECT_EQ(node->nameAt(1),
279+
EXPECT_EQ(node->nameAt(2),
281280
"email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
282-
EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
281+
EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field"
283282
EXPECT_EQ(
284-
node->nameAt(3),
283+
node->nameAt(5),
285284
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"
286285

287-
// Check that field IDs are correctly applied
288-
// Each field has 1 custom attribute: field-id
289-
ASSERT_EQ(node->customAttributes(), 4);
286+
EXPECT_EQ(node->nameAt(1), "valid_field");
287+
EXPECT_EQ(node->nameAt(3), "AnotherField");
288+
289+
ASSERT_EQ(node->customAttributes(), 6);
290290
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
291291
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
292292
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
293293
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
294-
}
295-
296-
TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
297-
// Test struct with field names that don't require sanitization
298-
StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(),
299-
/*optional=*/false},
300-
SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(),
301-
/*optional=*/true}}};
302-
303-
::avro::NodePtr node;
304-
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
305-
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
306-
307-
// Check that field names remain unchanged
308-
ASSERT_EQ(node->names(), 2);
309-
EXPECT_EQ(node->nameAt(0), "valid_field");
310-
EXPECT_EQ(node->nameAt(1), "AnotherField");
311-
312-
// Check that field IDs are correctly applied
313-
ASSERT_EQ(node->customAttributes(), 2);
314-
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
315-
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
294+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5));
295+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6));
316296

317-
// For valid field names, there should be no iceberg-field-name attributes
318-
const auto& attrs0 = node->customAttributesAt(0);
319-
const auto& attrs1 = node->customAttributesAt(1);
320-
EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value());
297+
const auto& attrs1 = node->customAttributesAt(1); // valid_field
298+
const auto& attrs3 = node->customAttributesAt(3); // AnotherField
321299
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
300+
EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value());
301+
302+
ASSERT_NO_FATAL_FAILURE(
303+
CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name"));
304+
ASSERT_NO_FATAL_FAILURE(
305+
CheckIcebergFieldName(node, /*index=*/2, /*original_name=*/"email.address"));
306+
ASSERT_NO_FATAL_FAILURE(
307+
CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field"));
308+
ASSERT_NO_FATAL_FAILURE(
309+
CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with spaces"));
322310
}
323311

324312
TEST(ToAvroNodeVisitorTest, ListType) {

0 commit comments

Comments
 (0)