From 385217f145d79e8bfc63fe04bb4c48782577ea89 Mon Sep 17 00:00:00 2001 From: Arunodoy18 Date: Thu, 20 Nov 2025 15:20:14 +0530 Subject: [PATCH] BEAM-13231: Add glob wildcard escaping utilities to FileSystems This commit adds support for escaping and unescaping glob wildcard characters in file path specifications, addressing the issue where files with literal glob metacharacters (*, ?, {, }) in their names cannot be matched. Changes: - Added escapeGlobWildcards(String spec) method to escape glob metacharacters by prefixing them with backslash - Added unescapeGlobWildcards(String spec) method to remove backslash prefixes from escaped glob characters - Added comprehensive test cases for both methods including round-trip testing These utilities provide the foundation for allowing users to treat glob metacharacters as literals when they appear in actual filenames. Fixes BEAM-13231 --- .../org/apache/beam/sdk/io/FileSystems.java | 36 +++++++++++++ .../apache/beam/sdk/io/FileSystemsTest.java | 50 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java index 7e2940a2c35b..2edc5c7ba45b 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java @@ -78,6 +78,8 @@ public class FileSystems { private static final Pattern FILE_SCHEME_PATTERN = Pattern.compile("(?[a-zA-Z][-a-zA-Z0-9+.]*):/.*"); private static final Pattern GLOB_PATTERN = Pattern.compile("[*?{}]"); + private static final Pattern ESCAPED_GLOB_PATTERN = Pattern.compile("\\\\[*?{}]"); + private static final String GLOB_ESCAPE_PREFIX = "\\"; private static final AtomicReference> FILESYSTEM_REVISION = new AtomicReference<>(); @@ -92,6 +94,40 @@ public static boolean hasGlobWildcard(String spec) { return GLOB_PATTERN.matcher(spec).find(); } + /** + * Escapes glob wildcard characters in the given spec so they are treated as literals. + * + *

This method escapes the characters '*', '?', '{', and '}' by prefixing them with a + * backslash, allowing them to be treated as literal characters in a file path rather than as + * glob wildcards. + * + *

Example: {@code escapeGlobWildcards("file*.txt")} returns {@code "file\\*.txt"} + * + * @param spec the file path specification to escape + * @return the escaped specification + */ + public static String escapeGlobWildcards(String spec) { + checkNotNull(spec, "spec cannot be null"); + return spec.replaceAll("([*?{}])", "\\\\$1"); + } + + /** + * Unescapes glob wildcard characters in the given spec that were previously escaped with {@link + * #escapeGlobWildcards(String)}. + * + *

This method removes the backslash prefix from escaped glob characters ('*', '?', '{', '}'), + * restoring them to their unescaped form. + * + *

Example: {@code unescapeGlobWildcards("file\\*.txt")} returns {@code "file*.txt"} + * + * @param spec the file path specification to unescape + * @return the unescaped specification + */ + public static String unescapeGlobWildcards(String spec) { + checkNotNull(spec, "spec cannot be null"); + return spec.replaceAll("\\\\([*?{}])", "$1"); + } + /** * This is the entry point to convert user-provided specs to {@link ResourceId ResourceIds}. * Callers should use {@link #match} to resolve users specs ambiguities before calling other diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileSystemsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileSystemsTest.java index 34567309c7d0..a513c1b1f462 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileSystemsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileSystemsTest.java @@ -354,6 +354,56 @@ private void createFileWithContent(Path path, String content) throws Exception { } } + @Test + public void testEscapeGlobWildcards() { + // Test escaping asterisk + assertEquals("file\\*.txt", FileSystems.escapeGlobWildcards("file*.txt")); + + // Test escaping question mark + assertEquals("file\\?.txt", FileSystems.escapeGlobWildcards("file?.txt")); + + // Test escaping braces + assertEquals("file\\{1,2\\}.txt", FileSystems.escapeGlobWildcards("file{1,2}.txt")); + + // Test escaping multiple characters + assertEquals("\\*\\?\\{\\}.txt", FileSystems.escapeGlobWildcards("*?{}.txt")); + + // Test string with no glob characters + assertEquals("file.txt", FileSystems.escapeGlobWildcards("file.txt")); + + // Test empty string + assertEquals("", FileSystems.escapeGlobWildcards("")); + } + + @Test + public void testUnescapeGlobWildcards() { + // Test unescaping asterisk + assertEquals("file*.txt", FileSystems.unescapeGlobWildcards("file\\*.txt")); + + // Test unescaping question mark + assertEquals("file?.txt", FileSystems.unescapeGlobWildcards("file\\?.txt")); + + // Test unescaping braces + assertEquals("file{1,2}.txt", FileSystems.unescapeGlobWildcards("file\\{1,2\\}.txt")); + + // Test unescaping multiple characters + assertEquals("*?{}.txt", FileSystems.unescapeGlobWildcards("\\*\\?\\{\\}.txt")); + + // Test string with no escaped characters + assertEquals("file.txt", FileSystems.unescapeGlobWildcards("file.txt")); + + // Test empty string + assertEquals("", FileSystems.unescapeGlobWildcards("")); + } + + @Test + public void testEscapeUnescapeRoundTrip() { + String original = "file*test?.txt"; + String escaped = FileSystems.escapeGlobWildcards(original); + String unescaped = FileSystems.unescapeGlobWildcards(escaped); + assertEquals(original, unescaped); + } + private LocalResourceId toLocalResourceId(String str) throws Exception { boolean isDirectory; if (SystemUtils.IS_OS_WINDOWS) {