
Commit e166b0e

Fix url_encode_hive_values tests to avoid filesystem limitations
Changed test data from using forward slash (/) to plus (+) in partition values, as forward slashes cannot be used in directory names on Unix/macOS filesystems. The tests now use characters that are valid in filenames but still demonstrate the URL encoding functionality:

- Space (encoded as %20)
- Plus + (encoded as %2B)
- Ampersand & (encoded as %26)
- Percent % (encoded as %25)

This ensures the tests pass on all platforms while still validating that the url_encode_hive_values parameter correctly controls URL encoding behavior.
1 parent 29d2325 commit e166b0e
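As a quick reference for the encodings listed above, here is a minimal Python sketch (illustrative only, not part of this commit) that reproduces the same percent-encodings with the standard library's urllib.parse.quote; the output matches the directory names asserted in the tests below, assuming the hive partitioning encoder percent-encodes all reserved characters:

# Illustrative sketch, not part of the commit: show the percent-encodings
# the commit message refers to, using Python's standard library.
from urllib.parse import quote

for value in ["Product A+B", "Site C&D", "100%"]:
    # safe='' so no characters are exempt from percent-encoding
    print(f"category={quote(value, safe='')}")

# Expected output:
# category=Product%20A%2BB
# category=Site%20C%26D
# category=100%25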

File tree

2 files changed, +32 -28 lines:

  python/pyarrow/tests/test_dataset.py
  r/tests/testthat/test-dataset-write.R


python/pyarrow/tests/test_dataset.py

Lines changed: 18 additions & 16 deletions
@@ -2385,49 +2385,51 @@ def test_hive_partitioning_dictionary_key(multisourcefs):
 
 def test_hive_partitioning_url_encoding(tempdir):
     # Test url_encode_hive_values parameter for write_dataset
+    # Note: Forward slash (/) cannot be used in directory names on Unix systems,
+    # so we test with other special characters that are valid in filenames
     table = pa.table({
         'id': [1, 2, 3],
-        'category': ['Product A/B', 'Site C&D', 'Normal Item']
+        'category': ['Product A+B', 'Site C&D', 'Normal Item']
     })
-
+
     # Test with URL encoding enabled (default)
     encoded_dir = tempdir / 'encoded'
-    ds.write_dataset(table, encoded_dir, format='ipc',
-                     partitioning=['category'],
+    ds.write_dataset(table, encoded_dir, format='ipc',
+                     partitioning=['category'],
                      partitioning_flavor='hive',
                      url_encode_hive_values=True)
-
+
     # Check that directories are URL-encoded
     dirs = [d.name for d in encoded_dir.iterdir() if d.is_dir()]
-    assert 'category=Product%20A%2FB' in dirs
+    assert 'category=Product%20A%2BB' in dirs
     assert 'category=Site%20C%26D' in dirs
     assert 'category=Normal%20Item' in dirs
-
+
     # Test with URL encoding disabled
     not_encoded_dir = tempdir / 'not_encoded'
     ds.write_dataset(table, not_encoded_dir, format='ipc',
-                     partitioning=['category'],
+                     partitioning=['category'],
                      partitioning_flavor='hive',
                      url_encode_hive_values=False)
-
+
     # Check that directories are NOT URL-encoded
     dirs = [d.name for d in not_encoded_dir.iterdir() if d.is_dir()]
-    assert 'category=Product A/B' in dirs
+    assert 'category=Product A+B' in dirs
     assert 'category=Site C&D' in dirs
     assert 'category=Normal Item' in dirs
-
+
     # Test that both datasets can be read correctly
     encoded_dataset = ds.dataset(encoded_dir, format='ipc', partitioning='hive')
     not_encoded_dataset = ds.dataset(not_encoded_dir, format='ipc', partitioning='hive')
-
+
     # Both should read the same data
     encoded_table = encoded_dataset.to_table().sort_by('id')
     not_encoded_table = not_encoded_dataset.to_table().sort_by('id')
     original_table = table.sort_by('id')
-
+
     assert encoded_table.equals(original_table)
     assert not_encoded_table.equals(original_table)
-
+
     # Test with explicitly created HivePartitioning object
     explicit_dir = tempdir / 'explicit'
     partitioning = ds.HivePartitioning(
@@ -2437,9 +2439,9 @@ def test_hive_partitioning_url_encoding(tempdir):
     ds.write_dataset(table, explicit_dir, format='ipc',
                      partitioning=partitioning,
                      url_encode_hive_values=False)  # Should be respected
-
+
     dirs = [d.name for d in explicit_dir.iterdir() if d.is_dir()]
-    assert 'category=Product A/B' in dirs
+    assert 'category=Product A+B' in dirs
     assert 'category=Site C&D' in dirs
     assert 'category=Normal Item' in dirs
 

r/tests/testthat/test-dataset-write.R

Lines changed: 14 additions & 12 deletions
@@ -1019,40 +1019,42 @@ test_that("Dataset write wrappers can write flat files using readr::write_csv()
 
 test_that("write_dataset respects url_encode_hive_values parameter", {
   skip_if_not_available("parquet")
-
+
   # Create test data with special characters that would be URL encoded
+  # Note: Forward slash (/) cannot be used in directory names on Unix systems,
+  # so we test with other special characters that are valid in filenames
   test_df <- data.frame(
     value = c(1, 2, 3, 4),
-    category = c("test space", "test/slash", "test%percent", "normal"),
+    category = c("test space", "test+plus", "test%percent", "normal"),
     stringsAsFactors = FALSE
   )
-
+
   # Test with URL encoding enabled (default)
   dst_dir_encoded <- make_temp_dir()
   write_dataset(test_df, dst_dir_encoded, partitioning = "category", hive_style = TRUE, url_encode_hive_values = TRUE)
-
-  # Test with URL encoding disabled
+
+  # Test with URL encoding disabled
   dst_dir_not_encoded <- make_temp_dir()
   write_dataset(test_df, dst_dir_not_encoded, partitioning = "category", hive_style = TRUE, url_encode_hive_values = FALSE)
-
+
   # Check that the directories are different (encoded vs not encoded)
   encoded_dirs <- list.dirs(dst_dir_encoded, recursive = FALSE)
   not_encoded_dirs <- list.dirs(dst_dir_not_encoded, recursive = FALSE)
-
+
   # The encoded version should have URL-encoded directory names
   expect_true(any(grepl("test%20space", encoded_dirs)))
-  expect_true(any(grepl("test%2Fslash", encoded_dirs)))
+  expect_true(any(grepl("test%2Bplus", encoded_dirs)))
   expect_true(any(grepl("test%25percent", encoded_dirs)))
-
+
   # The non-encoded version should have raw directory names
   expect_true(any(grepl("test space", not_encoded_dirs, fixed = TRUE)))
-  expect_true(any(grepl("test/slash", not_encoded_dirs, fixed = TRUE)))
+  expect_true(any(grepl("test+plus", not_encoded_dirs, fixed = TRUE)))
   expect_true(any(grepl("test%percent", not_encoded_dirs, fixed = TRUE)))
-
+
   # Both datasets should be readable and equivalent when loaded
   ds_encoded <- open_dataset(dst_dir_encoded)
   ds_not_encoded <- open_dataset(dst_dir_not_encoded)
-
+
   expect_equal(
     arrange(ds_encoded %>% collect(), value),
     arrange(ds_not_encoded %>% collect(), value)
