
Commit 5da1f4d

Authored by Sreesh Maheshwar (smaheshwar-pltr) and co-authors
URL-encode partition field names in file locations (apache#1457)
* URL-encode partition field names in file locations
* Separate into variable
* Add test
* Revert to main
* Failing test
* Disable justification from test
* Use `quote_plus` instead of `quote` to match Java behaviour
* Temporarily update test to pass
* Uncomment test
* Add unit test
* Fix typo in comment
* Add `make_name_compatible` suggestion so test passes
* Fix typo in schema field name

---------

Co-authored-by: Sreesh Maheshwar <[email protected]>
1 parent e646500 · commit 5da1f4d

File tree

3 files changed: +92 / -7 lines

pyiceberg/partitioning.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -30,7 +30,7 @@
     Tuple,
     TypeVar,
 )
-from urllib.parse import quote
+from urllib.parse import quote_plus
 
 from pydantic import (
     BeforeValidator,
@@ -234,9 +234,11 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])
 
-            value_str = quote(value_str, safe="")
+            value_str = quote_plus(value_str, safe="")
             value_strs.append(value_str)
-            field_strs.append(partition_field.name)
+
+            field_str = quote_plus(partition_field.name, safe="")
+            field_strs.append(field_str)
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path
```
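For context, a minimal sketch (not part of the commit) of why `quote_plus` replaces `quote` here: unlike `quote`, it encodes spaces as `+`, matching the `URLEncoder`-style escaping used by the Java implementation of Iceberg, as referenced in the unit test below.

```python
from urllib.parse import quote, quote_plus

# `quote` percent-encodes a space, while `quote_plus` maps it to "+",
# which is what the Java implementation writes in partition paths.
assert quote("special string", safe="") == "special%20string"
assert quote_plus("special string", safe="") == "special+string"

# Reserved characters in a partition field name are percent-encoded by both.
assert quote_plus("special#string+field", safe="") == "special%23string%2Bfield"
```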

tests/integration/test_partitioning_key.py

Lines changed: 64 additions & 3 deletions
```diff
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, List
+from typing import Any, Callable, List, Optional
 
 import pytest
 from pyspark.sql import SparkSession
@@ -70,14 +70,15 @@
     NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False),
     NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False),
     NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False),
+    NestedField(field_id=15, name="special#string+field", field_type=StringType(), required=False),
 )
 
 
 identifier = "default.test_table"
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
     [
         # # Identity Transform
         (
@@ -98,6 +99,7 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -117,6 +119,7 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -136,6 +139,7 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -155,6 +159,7 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -178,6 +183,7 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -201,6 +207,7 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -220,6 +227,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -239,6 +247,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -263,6 +272,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -287,6 +297,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -306,6 +317,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -325,6 +337,7 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -344,6 +357,7 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -363,6 +377,7 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
+            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -384,6 +399,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -403,6 +419,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -422,6 +439,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Year Transform
         (
@@ -442,6 +460,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -461,6 +480,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -480,6 +500,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # # Day Transform
         (
@@ -500,6 +521,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -519,6 +541,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -538,6 +561,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Hour Transform
         (
@@ -558,6 +582,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -577,6 +602,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         # Truncate Transform
         (
@@ -597,6 +623,7 @@
             VALUES
             (12345, 'Sample data for int');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -616,6 +643,7 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -635,6 +663,7 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -654,6 +683,7 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -673,6 +703,7 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
+            None,
         ),
         # Bucket Transform
         (
@@ -693,6 +724,7 @@
             VALUES
             (10, 'Integer with value 10');
             """,
+            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -721,6 +753,27 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
+            None,
+        ),
+        # Test that special characters are URL-encoded
+        (
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
+            ["special string"],
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
+            f"""CREATE TABLE {identifier} (
+                `special#string+field` string
+            )
+            USING iceberg
+            PARTITIONED BY (
+                identity(`special#string+field`)
+            )
+            """,
+            f"""INSERT INTO {identifier}
+            VALUES
+            ('special string')
+            """,
+            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -734,6 +787,7 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
+    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -768,5 +822,12 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    assert spark_partition_for_justification == expected_partition_record
+    # Special characters in partition value are sanitized when written to the data file's partition field
+    # Use `make_compatible_name` to match the sanitize behavior
+    sanitized_record = (
+        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+        if make_compatible_name
+        else expected_partition_record
+    )
+    assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification
```
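The new `make_compatible_name` parameter accounts for column-name sanitization on the write path: the partition field is stored in the data file under an Avro-compatible column name, with unsupported characters replaced by `_x`-prefixed hex codes. A small standalone illustration of the lambda used in the new test case:

```python
def make_compatible_name(name: str) -> str:
    # "#" (0x23) and "+" (0x2B) are replaced by "_x"-prefixed hex codes.
    return name.replace("#", "_x23").replace("+", "_x2B")

# The expected partition record key becomes the sanitized column name
# that Spark writes into the data file.
assert make_compatible_name("special#string+field") == "special_x23string_x2Bfield"
```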

tests/table/test_partitioning.py

Lines changed: 23 additions & 1 deletion
```diff
@@ -16,7 +16,8 @@
 # under the License.
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
-from pyiceberg.transforms import BucketTransform, TruncateTransform
+from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform
+from pyiceberg.typedef import Record
 from pyiceberg.types import (
     IntegerType,
     NestedField,
@@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None:
     )
 
 
+def test_partition_spec_to_path() -> None:
+    schema = Schema(
+        NestedField(field_id=1, name="str", field_type=StringType(), required=False),
+        NestedField(field_id=2, name="other_str", field_type=StringType(), required=False),
+        NestedField(field_id=3, name="int", field_type=IntegerType(), required=True),
+    )
+
+    spec = PartitionSpec(
+        PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"),
+        PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"),
+        PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"),
+        spec_id=3,
+    )
+
+    record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10})  # type: ignore
+
+    # Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java
+    # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204
+    assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10"
+
+
 def test_partition_type(table_schema_simple: Schema) -> None:
     spec = PartitionSpec(
         PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"),
```
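As a cross-check (an illustrative snippet, not part of the commit), the expected path in the new unit test can be reproduced by applying `quote_plus` to each partition field name and its human-readable value string, mirroring `partition_to_path` above:

```python
from urllib.parse import quote_plus

# Field names and human-readable value strings from test_partition_spec_to_path.
parts = [("my#str%bucket", "my+str"), ("other str+bucket", "( )"), ("my!int:bucket", "10")]

# partition_to_path URL-encodes both sides of each `name=value` segment.
path = "/".join(f"{quote_plus(name, safe='')}={quote_plus(value, safe='')}" for name, value in parts)
assert path == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10"
```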
