Commit 9214c45

Merge pull request #2845 from mabel-dev/copilot/support-wildcards-in-paths

Support wildcards in file paths for SQL queries

2 parents: 4f56e03 + f1063a1

File tree: 14 files changed, +326 -28 lines

README.md

Lines changed: 31 additions & 0 deletions

```diff
@@ -201,6 +201,37 @@ _this example requires a data file, [space_missions.parquet](https://storage.goo
 
 </details>
 
+<details>
+<summary>Query Multiple Files with Wildcards</summary>
+
+In this example, we are querying multiple files using wildcard patterns. Opteryx supports `*` (any characters), `?` (single character), and `[range]` patterns in file paths.
+
+~~~python
+# Import the Opteryx query engine.
+import opteryx
+
+# Execute a SQL query to select data from all parquet files in a directory.
+# The wildcard '*' matches any characters in the filename.
+result = opteryx.query("SELECT * FROM 'data/*.parquet' LIMIT 10;")
+
+# Display the result.
+result.head()
+~~~
+
+You can also use more specific patterns:
+
+~~~python
+# Query files matching a range pattern, e.g., file1.parquet through file9.parquet
+result = opteryx.query("SELECT COUNT(*) FROM 'data/file[1-9].parquet';")
+
+# Query files with specific naming patterns
+result = opteryx.query("SELECT * FROM 'logs/2024-01-*.jsonl';")
+~~~
+
+_Wildcards work with all supported file formats (Parquet, JSONL, CSV, etc.) and prevent path traversal for security._
+
+</details>
+
 <details>
 <summary>Query Data in SQLite</summary>
 
```
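The new README examples cover directory-level patterns; the same syntax composes with ordinary SQL, so aggregating across all matched files needs nothing extra. One more example in the same spirit, not part of the commit (the path is illustrative and assumes matching files exist):

~~~python
import opteryx

# Aggregate across every file matched by the wildcard; matched files
# are read in sorted order and treated as a single relation.
result = opteryx.query("SELECT COUNT(*) AS row_count FROM 'data/*.parquet';")
result.head()
~~~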
dev/build_counter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -35,7 +35,7 @@ class VersionStatus(Enum):
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it in the CLI for the same reason
-"""
+"""
 
 # Save the build number to the build.py file
 with open("opteryx/__version__.py", "w") as f:
```

opteryx/__version__.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -1,12 +1,11 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1650
+__build__ = 1651
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1650"
+__version__ = "0.26.0-beta.1651"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it in the CLI for the same reason
-
```
opteryx/connectors/__init__.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -269,7 +269,9 @@ def connector_factory(dataset, statistics, **config):
             connector = _lazy_import_connector(connector)
             break
     else:
-        if os.path.isfile(dataset):
+        # Check if dataset is a file or contains wildcards
+        has_wildcards = any(char in dataset for char in ['*', '?', '['])
+        if os.path.isfile(dataset) or has_wildcards:
             from opteryx.connectors import file_connector
 
             return file_connector.FileConnector(dataset=dataset, statistics=statistics)
```
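On the factory side, wildcard detection is a plain character scan, so a path containing a literal `*`, `?`, or `[` is routed to the file connector even when nothing matches (the connector then raises `DatasetNotFoundError` if expansion finds no files). A minimal sketch of the same heuristic; the helper name `looks_like_wildcard` is ours, not from the codebase:

~~~python
# The same heuristic the factory uses: any glob metacharacter means
# "treat this as a wildcard pattern" rather than a literal path.
def looks_like_wildcard(dataset: str) -> bool:
    return any(char in dataset for char in ('*', '?', '['))

assert looks_like_wildcard("data/*.parquet")
assert looks_like_wildcard("data/file[1-9].parquet")
assert not looks_like_wildcard("data/file1.parquet")
~~~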

opteryx/connectors/file_connector.py

Lines changed: 73 additions & 15 deletions

```diff
@@ -8,9 +8,11 @@
 dataset name in a query.
 """
 
+import glob
 import mmap
 import os
 from typing import Dict
+from typing import List
 from typing import Optional
 
 import pyarrow
```
```diff
@@ -134,5 +136,49 @@ def __init__(self, *args, **kwargs):
         if ".." in self.dataset or self.dataset[0] in ("\\", "/", "~"):
             # Don't find any datasets which look like path traversal
             raise DatasetNotFoundError(dataset=self.dataset)
-        self.decoder = get_decoder(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = any(char in self.dataset for char in ['*', '?', '['])
+
+        if self.has_wildcards:
+            # Expand wildcards to get list of files
+            self.files = self._expand_wildcards(self.dataset)
+            if not self.files:
+                raise DatasetNotFoundError(dataset=self.dataset)
+            # Use the first file to determine the decoder
+            self.decoder = get_decoder(self.files[0])
+        else:
+            self.files = [self.dataset]
+            self.decoder = get_decoder(self.dataset)
+
+    def _expand_wildcards(self, pattern: str) -> List[str]:
+        """
+        Expand wildcard patterns in file paths while preventing path traversal.
+
+        Supports wildcards:
+        - * matches any number of characters
+        - ? matches a single character
+        - [range] matches a range of characters (e.g., [0-9], [a-z])
+
+        Args:
+            pattern: File path pattern with wildcards
+
+        Returns:
+            List of matching file paths
+        """
+        # Additional path traversal check after expansion
+        if ".." in pattern:
+            raise DatasetNotFoundError(dataset=pattern)
+
+        # Use glob to expand the pattern
+        matched_files = glob.glob(pattern, recursive=False)
+
+        # Filter out any results that might have path traversal
+        # This is an extra safety check
+        safe_files = []
+        for file_path in matched_files:
+            if ".." not in file_path and os.path.isfile(file_path):
+                safe_files.append(file_path)
+
+        return sorted(safe_files)
 
```
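Stripped of the class machinery, `_expand_wildcards` is `glob.glob` plus a defensive filter: patterns expand non-recursively, anything containing `..` or not a regular file is dropped, and the survivors are sorted so files are read in a deterministic order. A standalone sketch of that behaviour (the pattern is illustrative):

~~~python
import glob
import os

# Expand a pattern the way the connector does: non-recursive glob,
# drop anything with ".." or that isn't a regular file, sort for
# a deterministic read order.
pattern = "data/*.parquet"  # illustrative path
matched = [
    path
    for path in glob.glob(pattern, recursive=False)
    if ".." not in path and os.path.isfile(path)
]
print(sorted(matched))
~~~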
```diff
@@ -139,16 +185,25 @@
     def read_dataset(
         self, columns: list = None, predicates: list = None, limit: int = None, **kwargs
     ) -> pyarrow.Table:
-        morsel = read_blob(
-            blob_name=self.dataset,
-            decoder=self.decoder,
-            statistics=self.statistics,
-            projection=columns,
-            selection=predicates,
-        )[3]
-
-        if limit is not None:
-            morsel = morsel.slice(offset=0, length=limit)
-
-        yield morsel
+        rows_read = 0
+
+        # Iterate over all matched files
+        for file_path in self.files:
+            morsel = read_blob(
+                blob_name=file_path,
+                decoder=self.decoder,
+                statistics=self.statistics,
+                projection=columns,
+                selection=predicates,
+            )[3]
+
+            if limit is not None:
+                remaining = limit - rows_read
+                if remaining <= 0:
+                    break
+                if morsel.num_rows > remaining:
+                    morsel = morsel.slice(offset=0, length=remaining)
+            rows_read += morsel.num_rows
+
+            yield morsel
 
```
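`read_dataset` becomes a generator over the matched files, and `LIMIT` is spread across them with a running row count: each file's morsel is trimmed to whatever remains of the limit, and iteration stops once it is exhausted. The same pattern in isolation, a minimal sketch with plain lists standing in for morsels:

~~~python
# A standalone sketch of the running-limit pattern read_dataset uses,
# with plain lists standing in for per-file morsels (illustrative data).
def take_limited(batches, limit=None):
    rows_read = 0
    for batch in batches:
        if limit is not None:
            remaining = limit - rows_read
            if remaining <= 0:
                break  # the limit was satisfied by earlier files
            if len(batch) > remaining:
                batch = batch[:remaining]  # slice, like morsel.slice()
        rows_read += len(batch)
        yield batch

# Three "files" of 4 rows each; a limit of 10 takes 4 + 4 + 2.
files = [list(range(4)) for _ in range(3)]
print([len(b) for b in take_limited(files, limit=10)])  # [4, 4, 2]
~~~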
```diff
@@ -164,9 +219,12 @@ def get_dataset_schema(self) -> RelationSchema:
         if self.schema is not None:
             return self.schema
 
+        # Use the first file to get the schema
+        first_file = self.files[0]
+
         try:
-            file_descriptor = os.open(self.dataset, os.O_RDONLY | os.O_BINARY)
-            size = os.path.getsize(self.dataset)
+            file_descriptor = os.open(first_file, os.O_RDONLY | os.O_BINARY)
+            size = os.path.getsize(first_file)
             _map = mmap.mmap(file_descriptor, size, access=mmap.ACCESS_READ)
             self.schema = self.decoder(_map, just_schema=True)
             self.relation_statistics = self.decoder(_map, just_statistics=True)
```
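Note that both the decoder and the schema come from the first matched file only, so a pattern spanning files with different layouts will only surface a mismatch later in the read. A quick pre-flight check is easy to sketch with pyarrow for Parquet inputs (the pattern is illustrative, and this check is not part of the commit; it assumes matching files exist):

~~~python
import glob
import pyarrow.parquet as pq

# Compare every matched file's schema against the first one before
# running a wildcard query over them.
paths = sorted(glob.glob("data/*.parquet"))  # illustrative pattern
reference = pq.read_schema(paths[0])
for path in paths[1:]:
    if not pq.read_schema(path).equals(reference):
        raise ValueError(f"schema mismatch in {path}")
~~~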

opteryx/connectors/sql_connector.py

Lines changed: 10 additions & 5 deletions

```diff
@@ -259,21 +259,26 @@ def collect_relation_stats(self) -> RelationStatistics:
         if row_est is not None:
             stats.record_count_estimate = int(row_est)
 
-        pg_stats = conn.execute(
-            text("""
+        pg_stats = (
+            conn.execute(
+                text("""
             SELECT attname, n_distinct, null_frac, histogram_bounds
             FROM pg_stats
             WHERE tablename = :t
             """),
-            {"t": table_name_only},
-        ).fetchall()
+                {"t": table_name_only},
+            )
+            .mappings()
+            .all()
+        )
 
         for row in pg_stats:
             col = row["attname"]
             stats.cardinality_estimate[col] = (
                 int(row["n_distinct"]) if row["n_distinct"] > 0 else 0
             )
-            stats.null_count[col] = int(row["null_frac"] * row_est)
+            if row_est is not None:
+                stats.null_count[col] = int(row["null_frac"] * row_est)
             bounds = row["histogram_bounds"]
             if bounds and isinstance(bounds, list) and len(bounds) >= 2:
                 stats.lower_bounds[col] = bounds[0]
```
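The switch from `.fetchall()` to `.mappings().all()` matters because the loop indexes rows by column name (`row["attname"]`): that works on the dict-like `RowMapping` objects `.mappings()` yields, but not on the plain `Row` tuples `fetchall()` returns in SQLAlchemy 2.x. A minimal sketch against an in-memory SQLite database (table and values are illustrative):

~~~python
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:")
with engine.connect() as conn:
    conn.execute(text("CREATE TABLE t (attname TEXT, null_frac REAL)"))
    conn.execute(text("INSERT INTO t VALUES ('id', 0.1)"))

    # .mappings() returns dict-like rows, so name-based access works.
    rows = conn.execute(text("SELECT * FROM t")).mappings().all()
    for row in rows:
        print(row["attname"], row["null_frac"])  # id 0.1
~~~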

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1650"
+version = "0.26.0-beta.1651"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
```
3 binary files changed (10.8 MB each); contents not shown.
