3232
3333namespace iceberg ::avro {
3434
// Forward declaration of the function exercised by the ValidAvroNameTest cases below.
// Declared here so the tests can call it directly.
bool validAvroName(const std::string& name);
37+
3538namespace {
3639
3740void CheckCustomLogicalType (const ::avro::NodePtr& node, const std::string& type_name) {
@@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
4750 ASSERT_EQ (attrs.getAttribute (key), std::make_optional (std::to_string (field_id)));
4851}
4952
53+ // Helper function to check if a custom attribute exists for a field name preservation
54+ void CheckIcebergFieldName (const ::avro::NodePtr& node, size_t index,
55+ const std::string& original_name) {
56+ ASSERT_LT (index, node->customAttributes ());
57+ const auto & attrs = node->customAttributesAt (index);
58+ ASSERT_EQ (attrs.getAttribute (" iceberg-field-name" ), std::make_optional (original_name));
59+ }
60+
5061} // namespace
5162
63+ TEST (ValidAvroNameTest, ValidNames) {
64+ // Valid field names should return true
65+ EXPECT_TRUE (validAvroName (" valid_field" ));
66+ EXPECT_TRUE (validAvroName (" field123" ));
67+ EXPECT_TRUE (validAvroName (" _private" ));
68+ EXPECT_TRUE (validAvroName (" CamelCase" ));
69+ EXPECT_TRUE (validAvroName (" field_with_underscores" ));
70+ }
71+
72+ TEST (ValidAvroNameTest, InvalidNames) {
73+ // Names starting with numbers should return false
74+ EXPECT_FALSE (validAvroName (" 123field" ));
75+ EXPECT_FALSE (validAvroName (" 0value" ));
76+
77+ // Names with special characters should return false
78+ EXPECT_FALSE (validAvroName (" field-name" ));
79+ EXPECT_FALSE (validAvroName (" field.name" ));
80+ EXPECT_FALSE (validAvroName (" field name" ));
81+ EXPECT_FALSE (validAvroName (" field@name" ));
82+ EXPECT_FALSE (validAvroName (" field#name" ));
83+ }
84+
85+ TEST (ValidAvroNameTest, EmptyName) {
86+ // Empty name should throw an exception
87+ EXPECT_THROW (validAvroName (" " ), std::runtime_error);
88+ }
89+
90+ TEST (SanitizeFieldNameTest, ValidFieldNames) {
91+ // Valid field names should remain unchanged
92+ EXPECT_EQ (SanitizeFieldName (" valid_field" ), " valid_field" );
93+ EXPECT_EQ (SanitizeFieldName (" field123" ), " field123" );
94+ EXPECT_EQ (SanitizeFieldName (" _private" ), " _private" );
95+ EXPECT_EQ (SanitizeFieldName (" CamelCase" ), " CamelCase" );
96+ EXPECT_EQ (SanitizeFieldName (" field_with_underscores" ), " field_with_underscores" );
97+ }
98+
99+ TEST (SanitizeFieldNameTest, InvalidFieldNames) {
100+ // Field names starting with numbers should be prefixed with underscore
101+ EXPECT_EQ (SanitizeFieldName (" 123field" ), " _123field" );
102+ EXPECT_EQ (SanitizeFieldName (" 0value" ), " _0value" );
103+
104+ // Field names with special characters should be encoded with hex values
105+ EXPECT_EQ (SanitizeFieldName (" field-name" ), " field_x2Dname" );
106+ EXPECT_EQ (SanitizeFieldName (" field.name" ), " field_x2Ename" );
107+ EXPECT_EQ (SanitizeFieldName (" field name" ), " field_x20name" );
108+ EXPECT_EQ (SanitizeFieldName (" field@name" ), " field_x40name" );
109+ EXPECT_EQ (SanitizeFieldName (" field#name" ), " field_x23name" );
110+
111+ // Complex field names with multiple issues
112+ EXPECT_EQ (SanitizeFieldName (" 1field-with.special@chars" ),
113+ " _1field_x2Dwith_x2Especial_x40chars" );
114+ EXPECT_EQ (SanitizeFieldName (" user-email" ), " user_x2Demail" );
115+ }
116+
117+ TEST (SanitizeFieldNameTest, EdgeCases) {
118+ // Empty field name
119+ EXPECT_EQ (SanitizeFieldName (" " ), " _x0" );
120+
121+ // Field name with only special characters
122+ EXPECT_EQ (SanitizeFieldName (" @#$" ), " _x40_x23_x24" );
123+
124+ // Field name starting with special character
125+ EXPECT_EQ (SanitizeFieldName (" -field" ), " _x2Dfield" );
126+ EXPECT_EQ (SanitizeFieldName (" .field" ), " _x2Efield" );
127+ }
128+
52129TEST (ToAvroNodeVisitorTest, BooleanType) {
53130 ::avro::NodePtr node;
54131 EXPECT_THAT (ToAvroNodeVisitor{}.Visit (BooleanType{}, &node), IsOk ());
@@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181258 EXPECT_EQ (node->leafAt (1 )->leafAt (1 )->type (), ::avro::AVRO_INT);
182259}
183260
261+ TEST (ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
262+ // Test struct with field names that require sanitization
263+ StructType struct_type{
264+ {SchemaField{/* field_id=*/ 1 , " user-name" , iceberg::string (),
265+ /* optional=*/ false },
266+ SchemaField{/* field_id=*/ 2 , " email.address" , iceberg::string (),
267+ /* optional=*/ true },
268+ SchemaField{/* field_id=*/ 3 , " 123field" , iceberg::int32 (),
269+ /* optional=*/ false },
270+ SchemaField{/* field_id=*/ 4 , " field with spaces" , iceberg::boolean (),
271+ /* optional=*/ true }}};
272+
273+ ::avro::NodePtr node;
274+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
275+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD);
276+
277+ // Check that field names are sanitized
278+ ASSERT_EQ (node->names (), 4 );
279+ EXPECT_EQ (node->nameAt (0 ), " user_x2Dname" ); // "user-name" -> "user_x2Dname"
280+ EXPECT_EQ (node->nameAt (1 ),
281+ " email_x2Eaddress" ); // "email.address" -> "email_x2Eaddress"
282+ EXPECT_EQ (node->nameAt (2 ), " _123field" ); // "123field" -> "_123field"
283+ EXPECT_EQ (
284+ node->nameAt (3 ),
285+ " field_x20with_x20spaces" ); // "field with spaces" -> "field_x20with_x20spaces"
286+
287+ // Check that field IDs are correctly applied
288+ // Each field has 1 custom attribute: field-id
289+ ASSERT_EQ (node->customAttributes (), 4 );
290+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
291+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
292+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 2 , /* field_id=*/ 3 ));
293+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 3 , /* field_id=*/ 4 ));
294+ }
295+
296+ TEST (ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
297+ // Test struct with field names that don't require sanitization
298+ StructType struct_type{{SchemaField{/* field_id=*/ 1 , " valid_field" , iceberg::string (),
299+ /* optional=*/ false },
300+ SchemaField{/* field_id=*/ 2 , " AnotherField" , iceberg::int32 (),
301+ /* optional=*/ true }}};
302+
303+ ::avro::NodePtr node;
304+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
305+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD);
306+
307+ // Check that field names remain unchanged
308+ ASSERT_EQ (node->names (), 2 );
309+ EXPECT_EQ (node->nameAt (0 ), " valid_field" );
310+ EXPECT_EQ (node->nameAt (1 ), " AnotherField" );
311+
312+ // Check that field IDs are correctly applied
313+ ASSERT_EQ (node->customAttributes (), 2 );
314+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
315+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
316+
317+ // For valid field names, there should be no iceberg-field-name attributes
318+ const auto & attrs0 = node->customAttributesAt (0 );
319+ const auto & attrs1 = node->customAttributesAt (1 );
320+ EXPECT_FALSE (attrs0.getAttribute (" iceberg-field-name" ).has_value ());
321+ EXPECT_FALSE (attrs1.getAttribute (" iceberg-field-name" ).has_value ());
322+ }
323+
184324TEST (ToAvroNodeVisitorTest, ListType) {
185325 ListType list_type{SchemaField{/* field_id=*/ 5 , " element" , iceberg::string (),
186326 /* optional=*/ true }};
@@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
14361576 auto result = MakeAvroNodeWithFieldIds (avro_schema.root (), *name_mapping);
14371577 ASSERT_THAT (result, IsError (ErrorKind::kInvalidSchema ));
14381578}
1439-
14401579} // namespace iceberg::avro
0 commit comments