@@ -613,14 +613,127 @@ def test_compression_level():
613613
614614
def test_sanitized_spark_field_names():
    """Check name sanitization when writing with ``flavor='spark'``.

    A table whose top-level and nested struct field names contain
    characters such as ``;``, ``,``, space, tab, newline, ``=``, braces
    and parentheses is round-tripped through ``_roundtrip_table``.  The
    assertions expect every such character to come back as a single
    ``_`` at every struct nesting level, and expect field-level and
    schema-level metadata to be returned unchanged.
    """
    field_metadata = {b'key': b'value'}
    schema_metadata = {b'schema_key': b'schema_value'}

    # Each special character below maps to exactly one underscore in the
    # expected names, so the literals must not carry extra whitespace.
    schema = pa.schema([
        pa.field('prohib; ,\t{}', pa.int32()),
        pa.field('field=with\nspecial', pa.string(), metadata=field_metadata),
        pa.field('nested_struct', pa.struct([
            pa.field('field,comma', pa.int32()),
            pa.field('deeply{nested}', pa.struct([
                pa.field('field(parens)', pa.float64()),
                pa.field('normal_field', pa.bool_())
            ]))
        ]))
    ], metadata=schema_metadata)

    data = [
        pa.array([1, 2]),
        pa.array(['a', 'b']),
        # The struct column is built with the original (unsanitized) type
        # so the dict keys match the declared child field names.
        pa.array([
            {'field,comma': 10, 'deeply{nested}': {
                'field(parens)': 1.5, 'normal_field': True}},
            {'field,comma': 20, 'deeply{nested}': {
                'field(parens)': 2.5, 'normal_field': False}}
        ], type=schema[2].type)
    ]

    table = pa.Table.from_arrays(data, schema=schema)
    result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})

    # Top-level names: 'prohib' + six special characters -> six underscores.
    assert result.schema[0].name == 'prohib______'
    assert result.schema[1].name == 'field_with_special'

    # First level of struct nesting.
    nested_type = result.schema[2].type
    assert nested_type[0].name == 'field_comma'
    assert nested_type[1].name == 'deeply_nested_'

    # Second level of struct nesting; an already-clean name is untouched.
    deep_type = nested_type[1].type
    assert deep_type[0].name == 'field_parens_'
    assert deep_type[1].name == 'normal_field'

    # Renaming must not drop metadata or rows.
    assert result.schema[1].metadata == field_metadata
    assert result.schema.metadata == schema_metadata
    assert len(result) == 2
659+
660+
def test_sanitized_spark_field_names_nested():
    """Check ``flavor='spark'`` sanitization of structs inside containers.

    Builds one column each of list, large list, fixed-size list and map
    type whose value (or key/item) type is a struct with special
    characters in its field names, round-trips the table through
    ``_roundtrip_table`` and asserts that both the column names and the
    struct field names inside each container come back with the special
    characters replaced by ``_``.
    """
    schema = pa.schema([
        # List containing struct with special chars
        pa.field('list;field', pa.list_(pa.field('item', pa.struct([
            pa.field('field,name', pa.int32()),
            pa.field('other{field}', pa.string())
        ])))),
        # Large list with nested struct
        pa.field('large=list', pa.large_list(pa.field('element', pa.struct([
            pa.field('nested(field)', pa.float64())
        ])))),
        # Fixed size list with nested struct (list_ with an explicit size).
        # The tab is the only special character in the column name, so
        # the expected sanitized name has a single underscore.
        pa.field('fixed\tlist', pa.list_(pa.field('item', pa.struct([
            pa.field('special field', pa.int32())
        ])), 2)),
        # Map with structs in both key and value
        pa.field('map field', pa.map_(
            pa.field('key', pa.struct(
                [pa.field('key;field', pa.string())]), nullable=False),
            pa.field('value', pa.struct([pa.field('value,field', pa.int32())]))
        ))
    ])

    # Every array is built against the original (unsanitized) column type
    # so the dict keys match the declared struct field names.
    list_data = pa.array([
        [{'field,name': 1, 'other{field}': 'a'}],
        [{'field,name': 2, 'other{field}': 'b'}]
    ], type=schema[0].type)

    large_list_data = pa.array([
        [{'nested(field)': 1.5}],
        [{'nested(field)': 2.5}]
    ], type=schema[1].type)

    # Two elements per row to match the fixed list size of 2.
    fixed_list_data = pa.array([
        [{'special field': 10}, {'special field': 20}],
        [{'special field': 30}, {'special field': 40}]
    ], type=schema[2].type)

    # Map rows are lists of (key, value) tuples.
    map_data = pa.array([
        [({'key;field': 'k1'}, {'value,field': 100})],
        [({'key;field': 'k2'}, {'value,field': 200})]
    ], type=schema[3].type)

    table = pa.Table.from_arrays(
        [list_data, large_list_data, fixed_list_data, map_data],
        schema=schema
    )

    result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})

    # Check top-level field names are sanitized
    assert result.schema[0].name == 'list_field'
    assert result.schema[1].name == 'large_list'
    assert result.schema[2].name == 'fixed_list'
    assert result.schema[3].name == 'map_field'

    # Check list value field's struct has sanitized names
    list_value_type = result.schema[0].type.value_type
    assert list_value_type[0].name == 'field_name'
    assert list_value_type[1].name == 'other_field_'

    # Check large list value field's struct has sanitized names
    large_list_value_type = result.schema[1].type.value_type
    assert large_list_value_type[0].name == 'nested_field_'

    # Check fixed size list value field's struct has sanitized names
    fixed_list_value_type = result.schema[2].type.value_type
    assert fixed_list_value_type[0].name == 'special_field'

    # Check map key and item structs have sanitized names
    map_key_type = result.schema[3].type.key_type
    map_item_type = result.schema[3].type.item_type
    assert map_key_type[0].name == 'key_field'
    assert map_item_type[0].name == 'value_field'
624737
625738
626739@pytest .mark .pandas
0 commit comments