1818import uuid
1919from datetime import date , datetime , timedelta , timezone
2020from decimal import Decimal
21- from typing import Any , List
21+ from typing import Any , Callable , List , Optional
2222
2323import pytest
2424from pyspark .sql import SparkSession
7070 NestedField (field_id = 12 , name = "fixed_field" , field_type = FixedType (16 ), required = False ),
7171 NestedField (field_id = 13 , name = "decimal_field" , field_type = DecimalType (5 , 2 ), required = False ),
7272 NestedField (field_id = 14 , name = "uuid_field" , field_type = UUIDType (), required = False ),
73+ NestedField (field_id = 15 , name = "special#string+field" , field_type = StringType (), required = False ),
7374)
7475
7576
7677identifier = "default.test_table"
7778
7879
7980@pytest .mark .parametrize (
80- "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification" ,
81+ "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name " ,
8182 [
8283 # # Identity Transform
8384 (
9899 VALUES
99100 (false, 'Boolean field set to false');
100101 """ ,
102+ None ,
101103 ),
102104 (
103105 [PartitionField (source_id = 2 , field_id = 1001 , transform = IdentityTransform (), name = "string_field" )],
117119 VALUES
118120 ('sample_string', 'Another string value')
119121 """ ,
122+ None ,
120123 ),
121124 (
122125 [PartitionField (source_id = 4 , field_id = 1001 , transform = IdentityTransform (), name = "int_field" )],
136139 VALUES
137140 (42, 'Associated string value for int 42')
138141 """ ,
142+ None ,
139143 ),
140144 (
141145 [PartitionField (source_id = 5 , field_id = 1001 , transform = IdentityTransform (), name = "long_field" )],
155159 VALUES
156160 (1234567890123456789, 'Associated string value for long 1234567890123456789')
157161 """ ,
162+ None ,
158163 ),
159164 (
160165 [PartitionField (source_id = 6 , field_id = 1001 , transform = IdentityTransform (), name = "float_field" )],
178183 # VALUES
179184 # (3.14, 'Associated string value for float 3.14')
180185 # """
186+ None ,
181187 ),
182188 (
183189 [PartitionField (source_id = 7 , field_id = 1001 , transform = IdentityTransform (), name = "double_field" )],
201207 # VALUES
202208 # (6.282, 'Associated string value for double 6.282')
203209 # """
210+ None ,
204211 ),
205212 (
206213 [PartitionField (source_id = 8 , field_id = 1001 , transform = IdentityTransform (), name = "timestamp_field" )],
220227 VALUES
221228 (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
222229 """ ,
230+ None ,
223231 ),
224232 (
225233 [PartitionField (source_id = 8 , field_id = 1001 , transform = IdentityTransform (), name = "timestamp_field" )],
239247 VALUES
240248 (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
241249 """ ,
250+ None ,
242251 ),
243252 (
244253 [PartitionField (source_id = 8 , field_id = 1001 , transform = IdentityTransform (), name = "timestamp_field" )],
263272 # VALUES
264273 # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
265274 # """
275+ None ,
266276 ),
267277 (
268278 [PartitionField (source_id = 9 , field_id = 1001 , transform = IdentityTransform (), name = "timestamptz_field" )],
287297 # VALUES
288298 # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
289299 # """
300+ None ,
290301 ),
291302 (
292303 [PartitionField (source_id = 10 , field_id = 1001 , transform = IdentityTransform (), name = "date_field" )],
306317 VALUES
307318 (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
308319 """ ,
320+ None ,
309321 ),
310322 (
311323 [PartitionField (source_id = 14 , field_id = 1001 , transform = IdentityTransform (), name = "uuid_field" )],
325337 VALUES
326338 ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
327339 """ ,
340+ None ,
328341 ),
329342 (
330343 [PartitionField (source_id = 11 , field_id = 1001 , transform = IdentityTransform (), name = "binary_field" )],
344357 VALUES
345358 (CAST('example' AS BINARY), 'Associated string value for binary `example`')
346359 """ ,
360+ None ,
347361 ),
348362 (
349363 [PartitionField (source_id = 13 , field_id = 1001 , transform = IdentityTransform (), name = "decimal_field" )],
363377 VALUES
364378 (123.45, 'Associated string value for decimal 123.45')
365379 """ ,
380+ None ,
366381 ),
367382 # # Year Month Day Hour Transform
368383 # Month Transform
384399 VALUES
385400 (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
386401 """ ,
402+ None ,
387403 ),
388404 (
389405 [PartitionField (source_id = 9 , field_id = 1001 , transform = MonthTransform (), name = "timestamptz_field_month" )],
403419 VALUES
404420 (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
405421 """ ,
422+ None ,
406423 ),
407424 (
408425 [PartitionField (source_id = 10 , field_id = 1001 , transform = MonthTransform (), name = "date_field_month" )],
422439 VALUES
423440 (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
424441 """ ,
442+ None ,
425443 ),
426444 # Year Transform
427445 (
442460 VALUES
443461 (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
444462 """ ,
463+ None ,
445464 ),
446465 (
447466 [PartitionField (source_id = 9 , field_id = 1001 , transform = YearTransform (), name = "timestamptz_field_year" )],
461480 VALUES
462481 (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
463482 """ ,
483+ None ,
464484 ),
465485 (
466486 [PartitionField (source_id = 10 , field_id = 1001 , transform = YearTransform (), name = "date_field_year" )],
480500 VALUES
481501 (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
482502 """ ,
503+ None ,
483504 ),
484505 # # Day Transform
485506 (
500521 VALUES
501522 (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
502523 """ ,
524+ None ,
503525 ),
504526 (
505527 [PartitionField (source_id = 9 , field_id = 1001 , transform = DayTransform (), name = "timestamptz_field_day" )],
519541 VALUES
520542 (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
521543 """ ,
544+ None ,
522545 ),
523546 (
524547 [PartitionField (source_id = 10 , field_id = 1001 , transform = DayTransform (), name = "date_field_day" )],
538561 VALUES
539562 (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
540563 """ ,
564+ None ,
541565 ),
542566 # Hour Transform
543567 (
558582 VALUES
559583 (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
560584 """ ,
585+ None ,
561586 ),
562587 (
563588 [PartitionField (source_id = 9 , field_id = 1001 , transform = HourTransform (), name = "timestamptz_field_hour" )],
577602 VALUES
578603 (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
579604 """ ,
605+ None ,
580606 ),
581607 # Truncate Transform
582608 (
597623 VALUES
598624 (12345, 'Sample data for int');
599625 """ ,
626+ None ,
600627 ),
601628 (
602629 [PartitionField (source_id = 5 , field_id = 1001 , transform = TruncateTransform (2 ), name = "bigint_field_trunc" )],
616643 VALUES
617644 (4294967297, 'Sample data for long');
618645 """ ,
646+ None ,
619647 ),
620648 (
621649 [PartitionField (source_id = 2 , field_id = 1001 , transform = TruncateTransform (3 ), name = "string_field_trunc" )],
635663 VALUES
636664 ('abcdefg', 'Another sample for string');
637665 """ ,
666+ None ,
638667 ),
639668 (
640669 [PartitionField (source_id = 13 , field_id = 1001 , transform = TruncateTransform (width = 5 ), name = "decimal_field_trunc" )],
654683 VALUES
655684 (678.90, 'Associated string value for decimal 678.90')
656685 """ ,
686+ None ,
657687 ),
658688 (
659689 [PartitionField (source_id = 11 , field_id = 1001 , transform = TruncateTransform (10 ), name = "binary_field_trunc" )],
673703 VALUES
674704 (binary('HELLOICEBERG'), 'Sample data for binary');
675705 """ ,
706+ None ,
676707 ),
677708 # Bucket Transform
678709 (
693724 VALUES
694725 (10, 'Integer with value 10');
695726 """ ,
727+ None ,
696728 ),
697729 # Test multiple field combinations could generate the Partition record and hive partition path correctly
698730 (
721753 VALUES
722754 (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
723755 """ ,
756+ None ,
757+ ),
758+ # Test that special characters are URL-encoded
759+ (
760+ [PartitionField (source_id = 15 , field_id = 1001 , transform = IdentityTransform (), name = "special#string+field" )],
761+ ["special string" ],
762+ Record (** {"special#string+field" : "special string" }), # type: ignore
763+ "special%23string%2Bfield=special+string" ,
764+ f"""CREATE TABLE { identifier } (
765+ `special#string+field` string
766+ )
767+ USING iceberg
768+ PARTITIONED BY (
769+ identity(`special#string+field`)
770+ )
771+ """ ,
772+ f"""INSERT INTO { identifier }
773+ VALUES
774+ ('special string')
775+ """ ,
776+ lambda name : name .replace ("#" , "_x23" ).replace ("+" , "_x2B" ),
724777 ),
725778 ],
726779)
@@ -734,6 +787,7 @@ def test_partition_key(
734787 expected_hive_partition_path_slice : str ,
735788 spark_create_table_sql_for_justification : str ,
736789 spark_data_insert_sql_for_justification : str ,
790+ make_compatible_name : Optional [Callable [[str ], str ]],
737791) -> None :
738792 partition_field_values = [PartitionFieldValue (field , value ) for field , value in zip (partition_fields , partition_values )]
739793 spec = PartitionSpec (* partition_fields )
@@ -768,5 +822,12 @@ def test_partition_key(
768822 spark_path_for_justification = (
769823 snapshot .manifests (iceberg_table .io )[0 ].fetch_manifest_entry (iceberg_table .io )[0 ].data_file .file_path
770824 )
771- assert spark_partition_for_justification == expected_partition_record
825+ # Special characters in partition value are sanitized when written to the data file's partition field
826+ # Use `make_compatible_name` to match the sanitize behavior
827+ sanitized_record = (
828+ Record (** {make_compatible_name (k ): v for k , v in vars (expected_partition_record ).items ()})
829+ if make_compatible_name
830+ else expected_partition_record
831+ )
832+ assert spark_partition_for_justification == sanitized_record
772833 assert expected_hive_partition_path_slice in spark_path_for_justification
0 commit comments