Commit bfc0589

Copilot and joocer committed
Add wildcard support to FileConnector
Co-authored-by: joocer <[email protected]>
1 parent 9e00242 commit bfc0589

File tree

7 files changed: +306 −16 lines

opteryx/connectors/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -269,7 +269,9 @@ def connector_factory(dataset, statistics, **config):
             connector = _lazy_import_connector(connector)
             break
     else:
-        if os.path.isfile(dataset):
+        # Check if dataset is a file or contains wildcards
+        has_wildcards = any(char in dataset for char in ['*', '?', '['])
+        if os.path.isfile(dataset) or has_wildcards:
             from opteryx.connectors import file_connector
 
             return file_connector.FileConnector(dataset=dataset, statistics=statistics)
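
With this change, a dataset path that contains *, ?, or [ now falls through to the FileConnector even though it is not a single existing file. A minimal usage sketch, assuming parquet files exist under a testdata/wildcard_test/ directory (the same layout the integration tests below rely on):

import opteryx

# A wildcard path is routed by connector_factory to FileConnector
result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/*.parquet'")
print(result.arrow().column(0)[0].as_py())  # total row count across every matched file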

opteryx/connectors/file_connector.py

Lines changed: 73 additions & 15 deletions

@@ -8,9 +8,11 @@
 dataset name in a query.
 """
 
+import glob
 import mmap
 import os
 from typing import Dict
+from typing import List
 from typing import Optional
 
 import pyarrow
@@ -134,23 +136,76 @@ def __init__(self, *args, **kwargs):
         if ".." in self.dataset or self.dataset[0] in ("\\", "/", "~"):
             # Don't find any datasets which look like path traversal
             raise DatasetNotFoundError(dataset=self.dataset)
-        self.decoder = get_decoder(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = any(char in self.dataset for char in ['*', '?', '['])
+
+        if self.has_wildcards:
+            # Expand wildcards to get list of files
+            self.files = self._expand_wildcards(self.dataset)
+            if not self.files:
+                raise DatasetNotFoundError(dataset=self.dataset)
+            # Use the first file to determine the decoder
+            self.decoder = get_decoder(self.files[0])
+        else:
+            self.files = [self.dataset]
+            self.decoder = get_decoder(self.dataset)
+
+    def _expand_wildcards(self, pattern: str) -> List[str]:
+        """
+        Expand wildcard patterns in file paths while preventing path traversal.
+
+        Supports wildcards:
+        - * matches any number of characters
+        - ? matches a single character
+        - [range] matches a range of characters (e.g., [0-9], [a-z])
+
+        Args:
+            pattern: File path pattern with wildcards
+
+        Returns:
+            List of matching file paths
+        """
+        # Additional path traversal check after expansion
+        if ".." in pattern:
+            raise DatasetNotFoundError(dataset=pattern)
+
+        # Use glob to expand the pattern
+        matched_files = glob.glob(pattern, recursive=False)
+
+        # Filter out any results that might have path traversal
+        # This is an extra safety check
+        safe_files = []
+        for file_path in matched_files:
+            if ".." not in file_path and os.path.isfile(file_path):
+                safe_files.append(file_path)
+
+        return sorted(safe_files)
 
     def read_dataset(
         self, columns: list = None, predicates: list = None, limit: int = None, **kwargs
     ) -> pyarrow.Table:
-        morsel = read_blob(
-            blob_name=self.dataset,
-            decoder=self.decoder,
-            statistics=self.statistics,
-            projection=columns,
-            selection=predicates,
-        )[3]
-
-        if limit is not None:
-            morsel = morsel.slice(offset=0, length=limit)
-
-        yield morsel
+        rows_read = 0
+
+        # Iterate over all matched files
+        for file_path in self.files:
+            morsel = read_blob(
+                blob_name=file_path,
+                decoder=self.decoder,
+                statistics=self.statistics,
+                projection=columns,
+                selection=predicates,
+            )[3]
+
+            if limit is not None:
+                remaining = limit - rows_read
+                if remaining <= 0:
+                    break
+                if morsel.num_rows > remaining:
+                    morsel = morsel.slice(offset=0, length=remaining)
+            rows_read += morsel.num_rows
+
+            yield morsel
 
     def get_dataset_schema(self) -> RelationSchema:
         """
@@ -164,9 +219,12 @@ def get_dataset_schema(self) -> RelationSchema:
         if self.schema is not None:
             return self.schema
 
+        # Use the first file to get the schema
+        first_file = self.files[0]
+
         try:
-            file_descriptor = os.open(self.dataset, os.O_RDONLY | os.O_BINARY)
-            size = os.path.getsize(self.dataset)
+            file_descriptor = os.open(first_file, os.O_RDONLY | os.O_BINARY)
+            size = os.path.getsize(first_file)
             _map = mmap.mmap(file_descriptor, size, access=mmap.ACCESS_READ)
             self.schema = self.decoder(_map, just_schema=True)
             self.relation_statistics = self.decoder(_map, just_statistics=True)
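
_expand_wildcards delegates matching to Python's glob module, so the supported patterns follow standard fnmatch semantics. A short standard-library illustration of what each wildcard matches (not connector code):

from fnmatch import fnmatch

# * matches any run of characters
print(fnmatch("tweets_2024.parquet", "*.parquet"))    # True

# ? matches exactly one character
print(fnmatch("fileA.parquet", "file?.parquet"))      # True
print(fnmatch("fileAB.parquet", "file?.parquet"))     # False

# [0-9] matches a single character from the range
print(fnmatch("file3.parquet", "file[0-9].parquet"))  # True
print(fnmatch("fileX.parquet", "file[0-9].parquet"))  # False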
3 binary files added, 10.8 MB each (binary content not shown).
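
These three binaries appear to be the parquet fixtures queried by the integration tests under testdata/wildcard_test/. A sketch of how comparable fixtures could be generated with pyarrow; the file names, columns, and 100,000-row size are assumptions taken from the tests below, not from the diff itself:

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical fixture layout inferred from the integration tests
for n in (1, 2, 3):
    table = pa.table(
        {
            "user_name": [f"user_{i}" for i in range(100_000)],
            "user_verified": [i % 2 == 0 for i in range(100_000)],
        }
    )
    pq.write_table(table, f"testdata/wildcard_test/file{n}.parquet")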
Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
"""
Integration tests for wildcard support in file paths
"""

import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

import pytest

# Skip this if opteryx is not properly installed
try:
    import opteryx
except ImportError:
    pytest.skip("opteryx not installed", allow_module_level=True)


def test_wildcard_asterisk():
    """Test SELECT with * wildcard in path"""
    result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/*.parquet'")

    # Should read from all 3 parquet files
    # Each file has 100000 rows, so total should be 300000
    count = result.arrow().column(0)[0].as_py()
    assert count == 300000, f"Expected 300000 rows, got {count}"


def test_wildcard_question_mark_range():
    """Test SELECT with range wildcard [1-2] in path"""
    result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/file[1-2].parquet'")

    # Should read from file1 and file2 only (200000 rows total)
    count = result.arrow().column(0)[0].as_py()
    assert count == 200000, f"Expected 200000 rows, got {count}"


def test_wildcard_specific_columns():
    """Test SELECT specific columns with wildcard path"""
    result = opteryx.query("SELECT user_name FROM 'testdata/wildcard_test/*.parquet' LIMIT 5")

    # Should return results
    assert result.rowcount == 5
    assert "user_name" in result.column_names


def test_wildcard_with_where_clause():
    """Test SELECT with WHERE clause and wildcard path"""
    result = opteryx.query(
        "SELECT user_name, user_verified FROM 'testdata/wildcard_test/*.parquet' "
        "WHERE user_name ILIKE '%news%'"
    )

    # Should read from all files and filter
    # Original single file has 122 matching rows, so 3 files should have 366
    assert result.rowcount == 366, f"Expected 366 rows, got {result.rowcount}"


def test_wildcard_no_matches():
    """Test that wildcard with no matches raises appropriate error"""
    with pytest.raises(Exception):  # Should raise DatasetNotFoundError
        opteryx.query("SELECT * FROM 'testdata/nonexistent/*.parquet'")


def test_wildcard_path_traversal_blocked():
    """Test that path traversal is blocked even with wildcards"""
    with pytest.raises(Exception):  # Should raise DatasetNotFoundError
        opteryx.query("SELECT * FROM '../*.parquet'")


if __name__ == "__main__":  # pragma: no cover
    # Run tests
    pytest.main([__file__, "-v"])
Lines changed: 157 additions & 0 deletions

@@ -0,0 +1,157 @@
"""
Test wildcard support in file paths
"""

import os
import sys
import tempfile

sys.path.insert(1, os.path.join(sys.path[0], "../../.."))

import pytest

from opteryx.connectors.file_connector import FileConnector
from opteryx.exceptions import DatasetNotFoundError


class MockStatistics:
    """Mock statistics object for testing"""
    def __init__(self):
        self.bytes_read = 0


def test_wildcard_detection():
    """Test that wildcards are correctly detected"""
    stats = MockStatistics()

    # These should be detected as wildcards
    connector = FileConnector(dataset="path/*.parquet", statistics=stats)
    assert connector.has_wildcards is True

    connector = FileConnector(dataset="path/file?.parquet", statistics=stats)
    assert connector.has_wildcards is True

    connector = FileConnector(dataset="path/file[0-9].parquet", statistics=stats)
    assert connector.has_wildcards is True


def test_wildcard_no_matches():
    """Test that wildcard with no matches raises DatasetNotFoundError"""
    stats = MockStatistics()

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="/nonexistent/path/*.parquet", statistics=stats)


def test_path_traversal_protection():
    """Test that path traversal is still blocked with wildcards"""
    stats = MockStatistics()

    # These should raise DatasetNotFoundError due to path traversal
    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="../*.parquet", statistics=stats)

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="path/../../*.parquet", statistics=stats)

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="~/*.parquet", statistics=stats)


def test_wildcard_expansion():
    """Test that wildcards are properly expanded to matching files"""
    # Create temporary test files
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create some test files
        test_files = [
            os.path.join(tmpdir, "file1.txt"),
            os.path.join(tmpdir, "file2.txt"),
            os.path.join(tmpdir, "file3.txt"),
        ]
        for f in test_files:
            with open(f, "w") as fp:
                fp.write("test content")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "*.txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Check that all files were found
        assert len(connector.files) == 3
        assert connector.has_wildcards is True

        # Check files are sorted
        assert connector.files == sorted(test_files)


def test_single_file_no_wildcard():
    """Test that single files still work without wildcards"""
    with tempfile.TemporaryDirectory() as tmpdir:
        test_file = os.path.join(tmpdir, "test.txt")
        with open(test_file, "w") as fp:
            fp.write("test content")

        stats = MockStatistics()
        connector = FileConnector(dataset=test_file, statistics=stats)

        assert connector.has_wildcards is False
        assert connector.files == [test_file]


def test_wildcard_range_pattern():
    """Test wildcard with range patterns like [0-9]"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create files matching a range pattern
        test_files = []
        for i in range(5):
            f = os.path.join(tmpdir, f"file{i}.txt")
            with open(f, "w") as fp:
                fp.write("test")
            test_files.append(f)

        # Create a file that shouldn't match
        non_match = os.path.join(tmpdir, "fileX.txt")
        with open(non_match, "w") as fp:
            fp.write("test")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "file[0-9].txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Should match only files with digits
        assert len(connector.files) == 5
        assert all("file" in f and any(str(i) in f for i in range(5)) for f in connector.files)
        assert non_match not in connector.files


def test_wildcard_question_mark():
    """Test wildcard with ? (single character match)"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create files
        file1 = os.path.join(tmpdir, "fileA.txt")
        file2 = os.path.join(tmpdir, "fileB.txt")
        file_no_match = os.path.join(tmpdir, "fileAB.txt")

        for f in [file1, file2, file_no_match]:
            with open(f, "w") as fp:
                fp.write("test")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "file?.txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Should match only single-character files
        assert len(connector.files) == 2
        assert file1 in connector.files
        assert file2 in connector.files
        assert file_no_match not in connector.files


if __name__ == "__main__":  # pragma: no cover
    import sys

    # Run tests
    pytest.main([__file__, "-v"])

0 commit comments