Commit 9214c45

Merge pull request #2845 from mabel-dev/copilot/support-wildcards-in-paths

Support wildcards in file paths for SQL queries

2 parents: 4f56e03 + f1063a1

File tree: 14 files changed, +326 -28 lines

README.md

Lines changed: 31 additions & 0 deletions

```diff
@@ -201,6 +201,37 @@ _this example requires a data file, [space_missions.parquet](https://storage.goo
 
 </details>
 
+<details>
+<summary>Query Multiple Files with Wildcards</summary>
+
+In this example, we are querying multiple files using wildcard patterns. Opteryx supports `*` (any characters), `?` (single character), and `[range]` patterns in file paths.
+
+~~~python
+# Import the Opteryx query engine.
+import opteryx
+
+# Execute a SQL query to select data from all parquet files in a directory.
+# The wildcard '*' matches any characters in the filename.
+result = opteryx.query("SELECT * FROM 'data/*.parquet' LIMIT 10;")
+
+# Display the result.
+result.head()
+~~~
+
+You can also use more specific patterns:
+
+~~~python
+# Query files matching a range pattern, e.g., file1.parquet through file9.parquet
+result = opteryx.query("SELECT COUNT(*) FROM 'data/file[1-9].parquet';")
+
+# Query files with specific naming patterns
+result = opteryx.query("SELECT * FROM 'logs/2024-01-*.jsonl';")
+~~~
+
+_Wildcards work with all supported file formats (Parquet, JSONL, CSV, etc.) and prevent path traversal for security._
+
+</details>
+
 <details>
 <summary>Query Data in SQLite</summary>
 
```
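The new README examples cover directory-level patterns; the same syntax composes with ordinary SQL, so aggregating across all matched files needs nothing extra. One more example in the same spirit, not part of the commit (the path is illustrative and assumes matching files exist):

~~~python
import opteryx

# Aggregate across every file matched by the wildcard; matched files
# are read in sorted order and treated as a single relation.
result = opteryx.query("SELECT COUNT(*) AS row_count FROM 'data/*.parquet';")
result.head()
~~~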
dev/build_counter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -35,7 +35,7 @@ class VersionStatus(Enum):
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it in the CLI for the same reason
-"""
+"""
 
 # Save the build number to the build.py file
 with open("opteryx/__version__.py", "w") as f:
```

opteryx/__version__.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -1,12 +1,11 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1650
+__build__ = 1651
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1650"
+__version__ = "0.26.0-beta.1651"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 # 2) we can import it in setup.py for the same reason
 # 3) we can import it in the CLI for the same reason
-
```
opteryx/connectors/__init__.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -269,7 +269,9 @@ def connector_factory(dataset, statistics, **config):
             connector = _lazy_import_connector(connector)
             break
     else:
-        if os.path.isfile(dataset):
+        # Check if dataset is a file or contains wildcards
+        has_wildcards = any(char in dataset for char in ['*', '?', '['])
+        if os.path.isfile(dataset) or has_wildcards:
             from opteryx.connectors import file_connector
 
             return file_connector.FileConnector(dataset=dataset, statistics=statistics)
```
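On the factory side, wildcard detection is a plain character scan, so a path containing a literal `*`, `?`, or `[` is routed to the file connector even when nothing matches (the connector then raises `DatasetNotFoundError` if expansion finds no files). A minimal sketch of the same heuristic; the helper name `looks_like_wildcard` is ours, not from the codebase:

~~~python
# The same heuristic the factory uses: any glob metacharacter means
# "treat this as a wildcard pattern" rather than a literal path.
def looks_like_wildcard(dataset: str) -> bool:
    return any(char in dataset for char in ('*', '?', '['))

assert looks_like_wildcard("data/*.parquet")
assert looks_like_wildcard("data/file[1-9].parquet")
assert not looks_like_wildcard("data/file1.parquet")
~~~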

opteryx/connectors/file_connector.py

Lines changed: 73 additions & 15 deletions

```diff
@@ -8,9 +8,11 @@
 dataset name in a query.
 """
 
+import glob
 import mmap
 import os
 from typing import Dict
+from typing import List
 from typing import Optional
 
 import pyarrow
```
```diff
@@ -134,5 +136,49 @@ def __init__(self, *args, **kwargs):
         if ".." in self.dataset or self.dataset[0] in ("\\", "/", "~"):
             # Don't find any datasets which look like path traversal
             raise DatasetNotFoundError(dataset=self.dataset)
-        self.decoder = get_decoder(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = any(char in self.dataset for char in ['*', '?', '['])
+
+        if self.has_wildcards:
+            # Expand wildcards to get list of files
+            self.files = self._expand_wildcards(self.dataset)
+            if not self.files:
+                raise DatasetNotFoundError(dataset=self.dataset)
+            # Use the first file to determine the decoder
+            self.decoder = get_decoder(self.files[0])
+        else:
+            self.files = [self.dataset]
+            self.decoder = get_decoder(self.dataset)
+
+    def _expand_wildcards(self, pattern: str) -> List[str]:
+        """
+        Expand wildcard patterns in file paths while preventing path traversal.
+
+        Supports wildcards:
+        - * matches any number of characters
+        - ? matches a single character
+        - [range] matches a range of characters (e.g., [0-9], [a-z])
+
+        Args:
+            pattern: File path pattern with wildcards
+
+        Returns:
+            List of matching file paths
+        """
+        # Additional path traversal check after expansion
+        if ".." in pattern:
+            raise DatasetNotFoundError(dataset=pattern)
+
+        # Use glob to expand the pattern
+        matched_files = glob.glob(pattern, recursive=False)
+
+        # Filter out any results that might have path traversal
+        # This is an extra safety check
+        safe_files = []
+        for file_path in matched_files:
+            if ".." not in file_path and os.path.isfile(file_path):
+                safe_files.append(file_path)
+
+        return sorted(safe_files)
 
```
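Stripped of the class machinery, `_expand_wildcards` is `glob.glob` plus a defensive filter: patterns expand non-recursively, anything containing `..` or not a regular file is dropped, and the survivors are sorted so files are read in a deterministic order. A standalone sketch of that behaviour (the pattern is illustrative):

~~~python
import glob
import os

# Expand a pattern the way the connector does: non-recursive glob,
# drop anything with ".." or that isn't a regular file, sort for
# a deterministic read order.
pattern = "data/*.parquet"  # illustrative path
matched = [
    path
    for path in glob.glob(pattern, recursive=False)
    if ".." not in path and os.path.isfile(path)
]
print(sorted(matched))
~~~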
```diff
@@ -139,16 +185,25 @@
     def read_dataset(
         self, columns: list = None, predicates: list = None, limit: int = None, **kwargs
     ) -> pyarrow.Table:
-        morsel = read_blob(
-            blob_name=self.dataset,
-            decoder=self.decoder,
-            statistics=self.statistics,
-            projection=columns,
-            selection=predicates,
-        )[3]
-
-        if limit is not None:
-            morsel = morsel.slice(offset=0, length=limit)
-
-        yield morsel
+        rows_read = 0
+
+        # Iterate over all matched files
+        for file_path in self.files:
+            morsel = read_blob(
+                blob_name=file_path,
+                decoder=self.decoder,
+                statistics=self.statistics,
+                projection=columns,
+                selection=predicates,
+            )[3]
+
+            if limit is not None:
+                remaining = limit - rows_read
+                if remaining <= 0:
+                    break
+                if morsel.num_rows > remaining:
+                    morsel = morsel.slice(offset=0, length=remaining)
+            rows_read += morsel.num_rows
+
+            yield morsel
 
```
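`read_dataset` becomes a generator over the matched files, and `LIMIT` is spread across them with a running row count: each file's morsel is trimmed to whatever remains of the limit, and iteration stops once it is exhausted. The same pattern in isolation, a minimal sketch with plain lists standing in for morsels:

~~~python
# A standalone sketch of the running-limit pattern read_dataset uses,
# with plain lists standing in for per-file morsels (illustrative data).
def take_limited(batches, limit=None):
    rows_read = 0
    for batch in batches:
        if limit is not None:
            remaining = limit - rows_read
            if remaining <= 0:
                break  # the limit was satisfied by earlier files
            if len(batch) > remaining:
                batch = batch[:remaining]  # slice, like morsel.slice()
        rows_read += len(batch)
        yield batch

# Three "files" of 4 rows each; a limit of 10 takes 4 + 4 + 2.
files = [list(range(4)) for _ in range(3)]
print([len(b) for b in take_limited(files, limit=10)])  # [4, 4, 2]
~~~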
```diff
@@ -164,9 +219,12 @@ def get_dataset_schema(self) -> RelationSchema:
         if self.schema is not None:
             return self.schema
 
+        # Use the first file to get the schema
+        first_file = self.files[0]
+
         try:
-            file_descriptor = os.open(self.dataset, os.O_RDONLY | os.O_BINARY)
-            size = os.path.getsize(self.dataset)
+            file_descriptor = os.open(first_file, os.O_RDONLY | os.O_BINARY)
+            size = os.path.getsize(first_file)
             _map = mmap.mmap(file_descriptor, size, access=mmap.ACCESS_READ)
             self.schema = self.decoder(_map, just_schema=True)
             self.relation_statistics = self.decoder(_map, just_statistics=True)
```
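Note that both the decoder and the schema come from the first matched file only, so a pattern spanning files with different layouts will only surface a mismatch later in the read. A quick pre-flight check is easy to sketch with pyarrow for Parquet inputs (the pattern is illustrative, and this check is not part of the commit; it assumes matching files exist):

~~~python
import glob
import pyarrow.parquet as pq

# Compare every matched file's schema against the first one before
# running a wildcard query over them.
paths = sorted(glob.glob("data/*.parquet"))  # illustrative pattern
reference = pq.read_schema(paths[0])
for path in paths[1:]:
    if not pq.read_schema(path).equals(reference):
        raise ValueError(f"schema mismatch in {path}")
~~~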

opteryx/connectors/sql_connector.py

Lines changed: 10 additions & 5 deletions

```diff
@@ -259,21 +259,26 @@ def collect_relation_stats(self) -> RelationStatistics:
         if row_est is not None:
             stats.record_count_estimate = int(row_est)
 
-        pg_stats = conn.execute(
-            text("""
+        pg_stats = (
+            conn.execute(
+                text("""
             SELECT attname, n_distinct, null_frac, histogram_bounds
             FROM pg_stats
             WHERE tablename = :t
             """),
-            {"t": table_name_only},
-        ).fetchall()
+                {"t": table_name_only},
+            )
+            .mappings()
+            .all()
+        )
 
         for row in pg_stats:
             col = row["attname"]
             stats.cardinality_estimate[col] = (
                 int(row["n_distinct"]) if row["n_distinct"] > 0 else 0
             )
-            stats.null_count[col] = int(row["null_frac"] * row_est)
+            if row_est is not None:
+                stats.null_count[col] = int(row["null_frac"] * row_est)
             bounds = row["histogram_bounds"]
             if bounds and isinstance(bounds, list) and len(bounds) >= 2:
                 stats.lower_bounds[col] = bounds[0]
```
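The switch from `.fetchall()` to `.mappings().all()` matters because the loop indexes rows by column name (`row["attname"]`): that works on the dict-like `RowMapping` objects `.mappings()` yields, but not on the plain `Row` tuples `fetchall()` returns in SQLAlchemy 2.x. A minimal sketch against an in-memory SQLite database (table and values are illustrative):

~~~python
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:")
with engine.connect() as conn:
    conn.execute(text("CREATE TABLE t (attname TEXT, null_frac REAL)"))
    conn.execute(text("INSERT INTO t VALUES ('id', 0.1)"))

    # .mappings() returns dict-like rows, so name-based access works.
    rows = conn.execute(text("SELECT * FROM t")).mappings().all()
    for row in rows:
        print(row["attname"], row["null_frac"])  # id 0.1
~~~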

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1650"
+version = "0.26.0-beta.1651"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
```
3 binary files changed (10.8 MB each); contents not shown.
