Commit bfc0589

Copilot and joocer committed
Add wildcard support to FileConnector
Co-authored-by: joocer <[email protected]>
1 parent 9e00242 commit bfc0589

File tree

7 files changed: +306 −16 lines

opteryx/connectors/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -269,7 +269,9 @@ def connector_factory(dataset, statistics, **config):
             connector = _lazy_import_connector(connector)
             break
     else:
-        if os.path.isfile(dataset):
+        # Check if dataset is a file or contains wildcards
+        has_wildcards = any(char in dataset for char in ['*', '?', '['])
+        if os.path.isfile(dataset) or has_wildcards:
             from opteryx.connectors import file_connector
 
             return file_connector.FileConnector(dataset=dataset, statistics=statistics)
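
With this change, a dataset path that contains *, ?, or [ now falls through to the FileConnector even though it is not a single existing file. A minimal usage sketch, assuming parquet files exist under a testdata/wildcard_test/ directory (the same layout the integration tests below rely on):

import opteryx

# A wildcard path is routed by connector_factory to FileConnector
result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/*.parquet'")
print(result.arrow().column(0)[0].as_py())  # total row count across every matched file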

opteryx/connectors/file_connector.py

Lines changed: 73 additions & 15 deletions

@@ -8,9 +8,11 @@
 dataset name in a query.
 """
 
+import glob
 import mmap
 import os
 from typing import Dict
+from typing import List
 from typing import Optional
 
 import pyarrow
@@ -134,23 +136,76 @@ def __init__(self, *args, **kwargs):
         if ".." in self.dataset or self.dataset[0] in ("\\", "/", "~"):
             # Don't find any datasets which look like path traversal
             raise DatasetNotFoundError(dataset=self.dataset)
-        self.decoder = get_decoder(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = any(char in self.dataset for char in ['*', '?', '['])
+
+        if self.has_wildcards:
+            # Expand wildcards to get list of files
+            self.files = self._expand_wildcards(self.dataset)
+            if not self.files:
+                raise DatasetNotFoundError(dataset=self.dataset)
+            # Use the first file to determine the decoder
+            self.decoder = get_decoder(self.files[0])
+        else:
+            self.files = [self.dataset]
+            self.decoder = get_decoder(self.dataset)
+
+    def _expand_wildcards(self, pattern: str) -> List[str]:
+        """
+        Expand wildcard patterns in file paths while preventing path traversal.
+
+        Supports wildcards:
+        - * matches any number of characters
+        - ? matches a single character
+        - [range] matches a range of characters (e.g., [0-9], [a-z])
+
+        Args:
+            pattern: File path pattern with wildcards
+
+        Returns:
+            List of matching file paths
+        """
+        # Additional path traversal check after expansion
+        if ".." in pattern:
+            raise DatasetNotFoundError(dataset=pattern)
+
+        # Use glob to expand the pattern
+        matched_files = glob.glob(pattern, recursive=False)
+
+        # Filter out any results that might have path traversal
+        # This is an extra safety check
+        safe_files = []
+        for file_path in matched_files:
+            if ".." not in file_path and os.path.isfile(file_path):
+                safe_files.append(file_path)
+
+        return sorted(safe_files)
 
     def read_dataset(
         self, columns: list = None, predicates: list = None, limit: int = None, **kwargs
     ) -> pyarrow.Table:
-        morsel = read_blob(
-            blob_name=self.dataset,
-            decoder=self.decoder,
-            statistics=self.statistics,
-            projection=columns,
-            selection=predicates,
-        )[3]
-
-        if limit is not None:
-            morsel = morsel.slice(offset=0, length=limit)
-
-        yield morsel
+        rows_read = 0
+
+        # Iterate over all matched files
+        for file_path in self.files:
+            morsel = read_blob(
+                blob_name=file_path,
+                decoder=self.decoder,
+                statistics=self.statistics,
+                projection=columns,
+                selection=predicates,
+            )[3]
+
+            if limit is not None:
+                remaining = limit - rows_read
+                if remaining <= 0:
+                    break
+                if morsel.num_rows > remaining:
+                    morsel = morsel.slice(offset=0, length=remaining)
+            rows_read += morsel.num_rows
+
+            yield morsel
 
     def get_dataset_schema(self) -> RelationSchema:
         """
@@ -164,9 +219,12 @@ def get_dataset_schema(self) -> RelationSchema:
         if self.schema is not None:
             return self.schema
 
+        # Use the first file to get the schema
+        first_file = self.files[0]
+
         try:
-            file_descriptor = os.open(self.dataset, os.O_RDONLY | os.O_BINARY)
-            size = os.path.getsize(self.dataset)
+            file_descriptor = os.open(first_file, os.O_RDONLY | os.O_BINARY)
+            size = os.path.getsize(first_file)
             _map = mmap.mmap(file_descriptor, size, access=mmap.ACCESS_READ)
             self.schema = self.decoder(_map, just_schema=True)
             self.relation_statistics = self.decoder(_map, just_statistics=True)
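
_expand_wildcards delegates matching to Python's glob module, so the supported patterns follow standard fnmatch semantics. A short standard-library illustration of what each wildcard matches (not connector code):

from fnmatch import fnmatch

# * matches any run of characters
print(fnmatch("tweets_2024.parquet", "*.parquet"))    # True

# ? matches exactly one character
print(fnmatch("fileA.parquet", "file?.parquet"))      # True
print(fnmatch("fileAB.parquet", "file?.parquet"))     # False

# [0-9] matches a single character from the range
print(fnmatch("file3.parquet", "file[0-9].parquet"))  # True
print(fnmatch("fileX.parquet", "file[0-9].parquet"))  # False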
3 binary files added, 10.8 MB each (binary content not shown).
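
These three binaries appear to be the parquet fixtures queried by the integration tests under testdata/wildcard_test/. A sketch of how comparable fixtures could be generated with pyarrow; the file names, columns, and 100,000-row size are assumptions taken from the tests below, not from the diff itself:

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical fixture layout inferred from the integration tests
for n in (1, 2, 3):
    table = pa.table(
        {
            "user_name": [f"user_{i}" for i in range(100_000)],
            "user_verified": [i % 2 == 0 for i in range(100_000)],
        }
    )
    pq.write_table(table, f"testdata/wildcard_test/file{n}.parquet")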
Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
"""
Integration tests for wildcard support in file paths
"""

import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

import pytest

# Skip this if opteryx is not properly installed
try:
    import opteryx
except ImportError:
    pytest.skip("opteryx not installed", allow_module_level=True)


def test_wildcard_asterisk():
    """Test SELECT with * wildcard in path"""
    result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/*.parquet'")

    # Should read from all 3 parquet files
    # Each file has 100000 rows, so total should be 300000
    count = result.arrow().column(0)[0].as_py()
    assert count == 300000, f"Expected 300000 rows, got {count}"


def test_wildcard_question_mark_range():
    """Test SELECT with range wildcard [1-2] in path"""
    result = opteryx.query("SELECT COUNT(*) FROM 'testdata/wildcard_test/file[1-2].parquet'")

    # Should read from file1 and file2 only (200000 rows total)
    count = result.arrow().column(0)[0].as_py()
    assert count == 200000, f"Expected 200000 rows, got {count}"


def test_wildcard_specific_columns():
    """Test SELECT specific columns with wildcard path"""
    result = opteryx.query("SELECT user_name FROM 'testdata/wildcard_test/*.parquet' LIMIT 5")

    # Should return results
    assert result.rowcount == 5
    assert "user_name" in result.column_names


def test_wildcard_with_where_clause():
    """Test SELECT with WHERE clause and wildcard path"""
    result = opteryx.query(
        "SELECT user_name, user_verified FROM 'testdata/wildcard_test/*.parquet' "
        "WHERE user_name ILIKE '%news%'"
    )

    # Should read from all files and filter
    # Original single file has 122 matching rows, so 3 files should have 366
    assert result.rowcount == 366, f"Expected 366 rows, got {result.rowcount}"


def test_wildcard_no_matches():
    """Test that wildcard with no matches raises appropriate error"""
    with pytest.raises(Exception):  # Should raise DatasetNotFoundError
        opteryx.query("SELECT * FROM 'testdata/nonexistent/*.parquet'")


def test_wildcard_path_traversal_blocked():
    """Test that path traversal is blocked even with wildcards"""
    with pytest.raises(Exception):  # Should raise DatasetNotFoundError
        opteryx.query("SELECT * FROM '../*.parquet'")


if __name__ == "__main__":  # pragma: no cover
    # Run tests
    pytest.main([__file__, "-v"])
Lines changed: 157 additions & 0 deletions

@@ -0,0 +1,157 @@
"""
Test wildcard support in file paths
"""

import os
import sys
import tempfile

sys.path.insert(1, os.path.join(sys.path[0], "../../.."))

import pytest

from opteryx.connectors.file_connector import FileConnector
from opteryx.exceptions import DatasetNotFoundError


class MockStatistics:
    """Mock statistics object for testing"""
    def __init__(self):
        self.bytes_read = 0


def test_wildcard_detection():
    """Test that wildcards are correctly detected"""
    stats = MockStatistics()

    # These should be detected as wildcards
    connector = FileConnector(dataset="path/*.parquet", statistics=stats)
    assert connector.has_wildcards is True

    connector = FileConnector(dataset="path/file?.parquet", statistics=stats)
    assert connector.has_wildcards is True

    connector = FileConnector(dataset="path/file[0-9].parquet", statistics=stats)
    assert connector.has_wildcards is True


def test_wildcard_no_matches():
    """Test that wildcard with no matches raises DatasetNotFoundError"""
    stats = MockStatistics()

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="/nonexistent/path/*.parquet", statistics=stats)


def test_path_traversal_protection():
    """Test that path traversal is still blocked with wildcards"""
    stats = MockStatistics()

    # These should raise DatasetNotFoundError due to path traversal
    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="../*.parquet", statistics=stats)

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="path/../../*.parquet", statistics=stats)

    with pytest.raises(DatasetNotFoundError):
        FileConnector(dataset="~/*.parquet", statistics=stats)


def test_wildcard_expansion():
    """Test that wildcards are properly expanded to matching files"""
    # Create temporary test files
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create some test files
        test_files = [
            os.path.join(tmpdir, "file1.txt"),
            os.path.join(tmpdir, "file2.txt"),
            os.path.join(tmpdir, "file3.txt"),
        ]
        for f in test_files:
            with open(f, "w") as fp:
                fp.write("test content")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "*.txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Check that all files were found
        assert len(connector.files) == 3
        assert connector.has_wildcards is True

        # Check files are sorted
        assert connector.files == sorted(test_files)


def test_single_file_no_wildcard():
    """Test that single files still work without wildcards"""
    with tempfile.TemporaryDirectory() as tmpdir:
        test_file = os.path.join(tmpdir, "test.txt")
        with open(test_file, "w") as fp:
            fp.write("test content")

        stats = MockStatistics()
        connector = FileConnector(dataset=test_file, statistics=stats)

        assert connector.has_wildcards is False
        assert connector.files == [test_file]


def test_wildcard_range_pattern():
    """Test wildcard with range patterns like [0-9]"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create files matching a range pattern
        test_files = []
        for i in range(5):
            f = os.path.join(tmpdir, f"file{i}.txt")
            with open(f, "w") as fp:
                fp.write("test")
            test_files.append(f)

        # Create a file that shouldn't match
        non_match = os.path.join(tmpdir, "fileX.txt")
        with open(non_match, "w") as fp:
            fp.write("test")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "file[0-9].txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Should match only files with digits
        assert len(connector.files) == 5
        assert all("file" in f and any(str(i) in f for i in range(5)) for f in connector.files)
        assert non_match not in connector.files


def test_wildcard_question_mark():
    """Test wildcard with ? (single character match)"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create files
        file1 = os.path.join(tmpdir, "fileA.txt")
        file2 = os.path.join(tmpdir, "fileB.txt")
        file_no_match = os.path.join(tmpdir, "fileAB.txt")

        for f in [file1, file2, file_no_match]:
            with open(f, "w") as fp:
                fp.write("test")

        stats = MockStatistics()
        pattern = os.path.join(tmpdir, "file?.txt")

        connector = FileConnector(dataset=pattern, statistics=stats)

        # Should match only single-character files
        assert len(connector.files) == 2
        assert file1 in connector.files
        assert file2 in connector.files
        assert file_no_match not in connector.files


if __name__ == "__main__":  # pragma: no cover
    import sys

    # Run tests
    pytest.main([__file__, "-v"])

0 commit comments