Skip to content

Commit ddef1a4

Browse files
Copilot and joocer committed
Improve wildcard matching to use glob-like semantics
Co-authored-by: joocer <[email protected]>
1 parent 66a40fe commit ddef1a4

File tree

2 files changed

+38
-5
lines changed

2 files changed

+38
-5
lines changed

opteryx/utils/paths.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,39 @@ def split_wildcard_path(path: str):
106106

107107
def match_wildcard(pattern: str, path: str) -> bool:
    """
    Check a path against a wildcard pattern, one path segment at a time.

    Both the pattern and the path are split on OS_SEP and the resulting
    segments are compared pairwise with fnmatch. Because wildcards are only
    ever evaluated inside a single segment, they cannot consume a separator:
    - '*' matches any run of characters within one segment
    - '?' matches exactly one character within one segment
    - '**' (matching across directory boundaries) is not supported yet

    This differs from calling fnmatch on the whole string, where '*' would
    also match path separators. The segment-wise behaviour is intended to
    stay consistent with glob.glob(), which is used for local files.

    Args:
        pattern: Pattern with wildcards (e.g., "bucket/path/*.parquet")
        path: Path to match (e.g., "bucket/path/file1.parquet")

    Returns:
        True if the path has the same number of segments as the pattern and
        every segment matches its pattern segment, otherwise False

    Examples:
        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/file.parquet")
        True
        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/sub/file.parquet")
        False
    """
    pattern_segments = pattern.split(OS_SEP)
    path_segments = path.split(OS_SEP)

    # Without '**' support a pattern can only match a path of the same depth.
    if len(pattern_segments) != len(path_segments):
        return False

    # all() stops at the first segment that fails to match.
    return all(
        fnmatch.fnmatch(segment, segment_pattern)
        for segment_pattern, segment in zip(pattern_segments, path_segments)
    )

tests/unit/utils/test_paths_wildcards.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,16 @@ def test_split_wildcard_path():
5555

5656

5757
def test_match_wildcard():
58-
"""Test wildcard pattern matching"""
59-
# Asterisk matches multiple characters
58+
"""Test wildcard pattern matching with glob-like semantics"""
59+
# Asterisk matches multiple characters (but not path separators)
6060
assert paths.match_wildcard("bucket/path/*.parquet", "bucket/path/file1.parquet") is True
6161
assert paths.match_wildcard("bucket/path/*.parquet", "bucket/path/file2.parquet") is True
6262
assert paths.match_wildcard("bucket/path/*.parquet", "bucket/path/data.csv") is False
6363

64-
# Question mark matches single character
64+
# Asterisk does NOT match across directory boundaries (glob-like behavior)
65+
assert paths.match_wildcard("bucket/path/*.parquet", "bucket/path/subdir/file.parquet") is False
66+
67+
# Question mark matches single character (but not path separators)
6568
assert paths.match_wildcard("bucket/path/file?.parquet", "bucket/path/file1.parquet") is True
6669
assert paths.match_wildcard("bucket/path/file?.parquet", "bucket/path/file2.parquet") is True
6770
assert paths.match_wildcard("bucket/path/file?.parquet", "bucket/path/file10.parquet") is False
@@ -71,6 +74,10 @@ def test_match_wildcard():
7174
assert paths.match_wildcard("bucket/path/file[0-9].parquet", "bucket/path/file5.parquet") is True
7275
assert paths.match_wildcard("bucket/path/file[0-9].parquet", "bucket/path/fileA.parquet") is False
7376

77+
# Wildcard in middle of path
78+
assert paths.match_wildcard("bucket/*/data.parquet", "bucket/subdir/data.parquet") is True
79+
assert paths.match_wildcard("bucket/*/data.parquet", "bucket/a/b/data.parquet") is False
80+
7481
# No wildcards - exact match
7582
assert paths.match_wildcard("bucket/path/data.parquet", "bucket/path/data.parquet") is True
7683
assert paths.match_wildcard("bucket/path/data.parquet", "bucket/path/other.parquet") is False

0 commit comments

Comments
 (0)