
Commit e166b0e

Fix url_encode_hive_values tests to avoid filesystem limitations
Changed test data from using forward slash (/) to plus (+) in partition values, as forward slashes cannot be used in directory names on Unix/macOS filesystems. The tests now use characters that are valid in filenames but still demonstrate the URL encoding functionality:

- Space (encoded as %20)
- Plus + (encoded as %2B)
- Ampersand & (encoded as %26)
- Percent % (encoded as %25)

This ensures the tests pass on all platforms while still validating that the url_encode_hive_values parameter correctly controls URL encoding behavior.
1 parent 29d2325 commit e166b0e
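As a quick reference for the encodings listed above, here is a minimal Python sketch (illustrative only, not part of this commit) that reproduces the same percent-encodings with the standard library's urllib.parse.quote; the output matches the directory names asserted in the tests below, assuming the hive partitioning encoder percent-encodes all reserved characters:

# Illustrative sketch, not part of the commit: show the percent-encodings
# the commit message refers to, using Python's standard library.
from urllib.parse import quote

for value in ["Product A+B", "Site C&D", "100%"]:
    # safe='' so no characters are exempt from percent-encoding
    print(f"category={quote(value, safe='')}")

# Expected output:
# category=Product%20A%2BB
# category=Site%20C%26D
# category=100%25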

File tree

2 files changed, +32 -28 lines:

  python/pyarrow/tests/test_dataset.py
  r/tests/testthat/test-dataset-write.R


python/pyarrow/tests/test_dataset.py

Lines changed: 18 additions & 16 deletions
@@ -2385,49 +2385,51 @@ def test_hive_partitioning_dictionary_key(multisourcefs):
 
 def test_hive_partitioning_url_encoding(tempdir):
     # Test url_encode_hive_values parameter for write_dataset
+    # Note: Forward slash (/) cannot be used in directory names on Unix systems,
+    # so we test with other special characters that are valid in filenames
     table = pa.table({
         'id': [1, 2, 3],
-        'category': ['Product A/B', 'Site C&D', 'Normal Item']
+        'category': ['Product A+B', 'Site C&D', 'Normal Item']
     })
-
+
     # Test with URL encoding enabled (default)
     encoded_dir = tempdir / 'encoded'
-    ds.write_dataset(table, encoded_dir, format='ipc',
-                     partitioning=['category'],
+    ds.write_dataset(table, encoded_dir, format='ipc',
+                     partitioning=['category'],
                      partitioning_flavor='hive',
                      url_encode_hive_values=True)
-
+
     # Check that directories are URL-encoded
     dirs = [d.name for d in encoded_dir.iterdir() if d.is_dir()]
-    assert 'category=Product%20A%2FB' in dirs
+    assert 'category=Product%20A%2BB' in dirs
     assert 'category=Site%20C%26D' in dirs
     assert 'category=Normal%20Item' in dirs
-
+
     # Test with URL encoding disabled
     not_encoded_dir = tempdir / 'not_encoded'
     ds.write_dataset(table, not_encoded_dir, format='ipc',
-                     partitioning=['category'],
+                     partitioning=['category'],
                      partitioning_flavor='hive',
                      url_encode_hive_values=False)
-
+
     # Check that directories are NOT URL-encoded
     dirs = [d.name for d in not_encoded_dir.iterdir() if d.is_dir()]
-    assert 'category=Product A/B' in dirs
+    assert 'category=Product A+B' in dirs
     assert 'category=Site C&D' in dirs
     assert 'category=Normal Item' in dirs
-
+
     # Test that both datasets can be read correctly
     encoded_dataset = ds.dataset(encoded_dir, format='ipc', partitioning='hive')
     not_encoded_dataset = ds.dataset(not_encoded_dir, format='ipc', partitioning='hive')
-
+
     # Both should read the same data
     encoded_table = encoded_dataset.to_table().sort_by('id')
     not_encoded_table = not_encoded_dataset.to_table().sort_by('id')
     original_table = table.sort_by('id')
-
+
     assert encoded_table.equals(original_table)
     assert not_encoded_table.equals(original_table)
-
+
     # Test with explicitly created HivePartitioning object
     explicit_dir = tempdir / 'explicit'
     partitioning = ds.HivePartitioning(
@@ -2437,9 +2439,9 @@ def test_hive_partitioning_url_encoding(tempdir):
     ds.write_dataset(table, explicit_dir, format='ipc',
                      partitioning=partitioning,
                      url_encode_hive_values=False)  # Should be respected
-
+
     dirs = [d.name for d in explicit_dir.iterdir() if d.is_dir()]
-    assert 'category=Product A/B' in dirs
+    assert 'category=Product A+B' in dirs
     assert 'category=Site C&D' in dirs
     assert 'category=Normal Item' in dirs
 

r/tests/testthat/test-dataset-write.R

Lines changed: 14 additions & 12 deletions
@@ -1019,40 +1019,42 @@ test_that("Dataset write wrappers can write flat files using readr::write_csv()
 
 test_that("write_dataset respects url_encode_hive_values parameter", {
   skip_if_not_available("parquet")
-
+
   # Create test data with special characters that would be URL encoded
+  # Note: Forward slash (/) cannot be used in directory names on Unix systems,
+  # so we test with other special characters that are valid in filenames
   test_df <- data.frame(
     value = c(1, 2, 3, 4),
-    category = c("test space", "test/slash", "test%percent", "normal"),
+    category = c("test space", "test+plus", "test%percent", "normal"),
     stringsAsFactors = FALSE
   )
-
+
   # Test with URL encoding enabled (default)
   dst_dir_encoded <- make_temp_dir()
   write_dataset(test_df, dst_dir_encoded, partitioning = "category", hive_style = TRUE, url_encode_hive_values = TRUE)
-
-  # Test with URL encoding disabled
+
+  # Test with URL encoding disabled
   dst_dir_not_encoded <- make_temp_dir()
   write_dataset(test_df, dst_dir_not_encoded, partitioning = "category", hive_style = TRUE, url_encode_hive_values = FALSE)
-
+
   # Check that the directories are different (encoded vs not encoded)
   encoded_dirs <- list.dirs(dst_dir_encoded, recursive = FALSE)
   not_encoded_dirs <- list.dirs(dst_dir_not_encoded, recursive = FALSE)
-
+
   # The encoded version should have URL-encoded directory names
   expect_true(any(grepl("test%20space", encoded_dirs)))
-  expect_true(any(grepl("test%2Fslash", encoded_dirs)))
+  expect_true(any(grepl("test%2Bplus", encoded_dirs)))
   expect_true(any(grepl("test%25percent", encoded_dirs)))
-
+
   # The non-encoded version should have raw directory names
   expect_true(any(grepl("test space", not_encoded_dirs, fixed = TRUE)))
-  expect_true(any(grepl("test/slash", not_encoded_dirs, fixed = TRUE)))
+  expect_true(any(grepl("test+plus", not_encoded_dirs, fixed = TRUE)))
   expect_true(any(grepl("test%percent", not_encoded_dirs, fixed = TRUE)))
-
+
   # Both datasets should be readable and equivalent when loaded
   ds_encoded <- open_dataset(dst_dir_encoded)
   ds_not_encoded <- open_dataset(dst_dir_not_encoded)
-
+
   expect_equal(
     arrange(ds_encoded %>% collect(), value),
     arrange(ds_not_encoded %>% collect(), value)
