
Commit 0c3e43e

Merge pull request #2846 from mabel-dev/copilot/support-protocol-prefixes
Support protocol prefixes for paths with wildcards (gs://, s3://) and permission controls
2 parents 9214c45 + 88d8a0e commit 0c3e43e

11 files changed: +760 −22 lines

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY

-__build__ = 1651
+__build__ = 1652
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1651"
+__version__ = "0.26.0-beta.1652"

 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -285,7 +285,10 @@ def connector_factory(dataset, statistics, **config):
     prefix = connector_entry.pop("prefix", "")
     remove_prefix = connector_entry.pop("remove_prefix", False)
     if prefix and remove_prefix and dataset.startswith(prefix):
-        dataset = dataset[len(prefix) + 1 :]
+        # Remove the prefix. If there's a separator (. or //) after the prefix, skip it too
+        dataset = dataset[len(prefix):]
+        if dataset.startswith(".") or dataset.startswith("//"):
+            dataset = dataset[1:] if dataset.startswith(".") else dataset[2:]

     return connector(dataset=dataset, statistics=statistics, **connector_entry)
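
The prefix stripping now accepts both dotted dataset names and protocol-style paths. A standalone sketch of the same logic (hypothetical prefix and dataset values, not the connector code itself):

```python
def strip_prefix(dataset: str, prefix: str) -> str:
    # Mirror of the change above: drop the prefix, then a single "." or "//" separator.
    if prefix and dataset.startswith(prefix):
        dataset = dataset[len(prefix):]
        if dataset.startswith("."):
            dataset = dataset[1:]
        elif dataset.startswith("//"):
            dataset = dataset[2:]
    return dataset

print(strip_prefix("lake.customers.orders", "lake"))        # -> customers.orders
print(strip_prefix("lake//bucket/path/*.parquet", "lake"))  # -> bucket/path/*.parquet
```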

opteryx/connectors/aws_s3_connector.py

Lines changed: 44 additions & 9 deletions

@@ -86,19 +86,54 @@ def __init__(self, credentials=None, **kwargs):
         )

         self.minio = Minio(end_point, access_key, secret_key, secure=secure)
-        self.dataset = self.dataset.replace(".", OS_SEP)
+
+        # Only convert dots to path separators if the dataset doesn't already contain slashes
+        # Dataset references like "my.dataset.table" use dots as separators
+        # File paths like "bucket/path/file.parquet" already have slashes and should not be converted
+        if OS_SEP not in self.dataset and "/" not in self.dataset:
+            self.dataset = self.dataset.replace(".", OS_SEP)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = paths.has_wildcards(self.dataset)
+        if self.has_wildcards:
+            # For wildcards, we need to split into prefix and pattern
+            self.wildcard_prefix, self.wildcard_pattern = paths.split_wildcard_path(self.dataset)
+        else:
+            self.wildcard_prefix = None
+            self.wildcard_pattern = None

     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
-        bucket, object_path, _, _ = paths.get_parts(prefix)
+        # If we have wildcards, use the wildcard prefix for listing
+        if self.has_wildcards:
+            list_prefix = self.wildcard_prefix
+            filter_pattern = self.wildcard_pattern
+        else:
+            list_prefix = prefix
+            filter_pattern = None
+
+        bucket, object_path, _, _ = paths.get_parts(list_prefix)
         blobs = self.minio.list_objects(bucket_name=bucket, prefix=object_path, recursive=True)
-        blobs = (
-            bucket + "/" + blob.object_name for blob in blobs if not blob.object_name.endswith("/")
-        )
-
-        return sorted(
-            blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS
-        )
+
+        blob_list = []
+        for blob in blobs:
+            if blob.object_name.endswith("/"):
+                continue
+
+            full_path = bucket + "/" + blob.object_name
+
+            # Check if blob has valid extension
+            if ("." + full_path.split(".")[-1].lower()) not in VALID_EXTENSIONS:
+                continue
+
+            # If we have a wildcard pattern, filter by it
+            if filter_pattern:
+                if paths.match_wildcard(filter_pattern, full_path):
+                    blob_list.append(full_path)
+            else:
+                blob_list.append(full_path)
+
+        return sorted(blob_list)

     def read_dataset(
         self, columns: list = None, just_schema: bool = False, **kwargs
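
The listing strategy here is "list by the longest literal prefix, then filter against the full pattern". A self-contained sketch of that flow with made-up blob names (no MinIO client involved; the segment matcher is re-implemented inline):

```python
import fnmatch

def match_segments(pattern: str, path: str) -> bool:
    # Like match_wildcard above: '*' and '?' do not cross '/' boundaries.
    pattern_parts, path_parts = pattern.split("/"), path.split("/")
    if len(pattern_parts) != len(path_parts):
        return False
    return all(fnmatch.fnmatch(part, pat) for part, pat in zip(path_parts, pattern_parts))

# Hypothetical listing returned by the object store for the prefix "bucket/logs/"
listed = [
    "bucket/logs/2024-01-01.parquet",
    "bucket/logs/2024-01-02.csv",
    "bucket/logs/archive/2023-12-31.parquet",
]

pattern = "bucket/logs/*.parquet"
print(sorted(blob for blob in listed if match_segments(pattern, blob)))
# ['bucket/logs/2024-01-01.parquet']
```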

opteryx/connectors/gcp_cloudstorage_connector.py

Lines changed: 39 additions & 8 deletions

@@ -91,9 +91,24 @@ def __init__(self, credentials=None, **kwargs):
         Asynchronous.__init__(self, **kwargs)
         Statistics.__init__(self, **kwargs)

-        self.dataset = self.dataset.replace(".", OS_SEP)
+        # Only convert dots to path separators if the dataset doesn't already contain slashes
+        # Dataset references like "my.dataset.table" use dots as separators
+        # File paths like "bucket/path/file.parquet" already have slashes and should not be converted
+        if OS_SEP not in self.dataset and "/" not in self.dataset:
+            self.dataset = self.dataset.replace(".", OS_SEP)
         self.credentials = credentials
-        self.bucket, _, _, _ = paths.get_parts(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = paths.has_wildcards(self.dataset)
+        if self.has_wildcards:
+            # For wildcards, we need to split into prefix and pattern
+            # The prefix is used for listing, pattern for filtering
+            self.wildcard_prefix, self.wildcard_pattern = paths.split_wildcard_path(self.dataset)
+            self.bucket, _, _, _ = paths.get_parts(self.wildcard_prefix or self.dataset)
+        else:
+            self.wildcard_prefix = None
+            self.wildcard_pattern = None
+            self.bucket, _, _, _ = paths.get_parts(self.dataset)

         # we're going to cache the first blob as the schema and dataset reader
         # sometimes both start here
@@ -181,7 +196,15 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
         if prefix in self.blob_list:
             return self.blob_list[prefix]

-        bucket, object_path, _, _ = paths.get_parts(prefix)
+        # If we have wildcards, use the wildcard prefix for listing
+        if self.has_wildcards:
+            list_prefix = self.wildcard_prefix
+            filter_pattern = self.wildcard_pattern
+        else:
+            list_prefix = prefix
+            filter_pattern = None
+
+        bucket, object_path, _, _ = paths.get_parts(list_prefix)
         if "kh" not in bucket:
             bucket = bucket.replace("va_data", "va-data")
             bucket = bucket.replace("data_", "data-")
@@ -204,11 +227,19 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
             raise DatasetReadError(f"Error fetching blob list: {response.text}")

         blob_data = response.json()
-        blob_names.extend(
-            f"{bucket}/{name}"
-            for name in (blob["name"] for blob in blob_data.get("items", []))
-            if name.endswith(TUPLE_OF_VALID_EXTENSIONS)
-        )
+        for blob in blob_data.get("items", []):
+            name = blob["name"]
+            if not name.endswith(TUPLE_OF_VALID_EXTENSIONS):
+                continue

+            full_path = f"{bucket}/{name}"
+
+            # If we have a wildcard pattern, filter by it
+            if filter_pattern:
+                if paths.match_wildcard(filter_pattern, full_path):
+                    blob_names.append(full_path)
+            else:
+                blob_names.append(full_path)

         page_token = blob_data.get("nextPageToken")
         if not page_token:
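
On the GCS side the same pattern filter sits inside a paginated listing loop. A condensed, requests-based sketch of that loop (the endpoint URL, the omitted authentication, and the `match` callable are simplifications for illustration, not the connector's own code):

```python
import requests

def list_matching_blobs(bucket: str, list_prefix: str, filter_pattern, match) -> list:
    """Page through a GCS JSON API object listing, keeping blobs that match the pattern."""
    blob_names = []
    page_token = None
    while True:
        params = {"prefix": list_prefix}
        if page_token:
            params["pageToken"] = page_token
        response = requests.get(
            f"https://storage.googleapis.com/storage/v1/b/{bucket}/o", params=params, timeout=30
        )
        response.raise_for_status()
        blob_data = response.json()
        for blob in blob_data.get("items", []):
            full_path = f"{bucket}/{blob['name']}"
            if filter_pattern is None or match(filter_pattern, full_path):
                blob_names.append(full_path)
        page_token = blob_data.get("nextPageToken")
        if not page_token:
            break
    return blob_names
```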

opteryx/utils/paths.py

Lines changed: 103 additions & 0 deletions

@@ -7,6 +7,7 @@
 Functions to help with handling file paths
 """

+import fnmatch
 import os

 OS_SEP = os.sep
@@ -39,3 +40,105 @@ def get_parts(path_string: str):
     parts_path = OS_SEP.join(parts)

     return bucket, parts_path, file_name, suffix
+
+
+def has_wildcards(path: str) -> bool:
+    """
+    Check if a path contains wildcard characters.
+
+    Args:
+        path: Path string to check
+
+    Returns:
+        True if path contains wildcards (*, ?, [])
+    """
+    return any(char in path for char in ['*', '?', '['])
+
+
+def split_wildcard_path(path: str):
+    """
+    Split a path with wildcards into a non-wildcard prefix and wildcard pattern.
+
+    For cloud storage, we need to list blobs with a prefix, then filter by pattern.
+    This function finds the longest non-wildcard prefix for listing.
+
+    Args:
+        path: Path with potential wildcards (e.g., "bucket/path/subdir/*.parquet")
+
+    Returns:
+        tuple: (prefix, pattern) where:
+            - prefix: Non-wildcard prefix for listing (e.g., "bucket/path/subdir/")
+            - pattern: Full path with wildcards for matching (e.g., "bucket/path/subdir/*.parquet")
+
+    Examples:
+        >>> split_wildcard_path("bucket/path/*.parquet")
+        ('bucket/path/', 'bucket/path/*.parquet')
+
+        >>> split_wildcard_path("bucket/path/file[0-9].parquet")
+        ('bucket/path/', 'bucket/path/file[0-9].parquet')
+
+        >>> split_wildcard_path("bucket/*/data.parquet")
+        ('bucket/', 'bucket/*/data.parquet')
+    """
+    if not has_wildcards(path):
+        return path, path
+
+    # Find the first wildcard character
+    wildcard_pos = len(path)
+    for char in ['*', '?', '[']:
+        pos = path.find(char)
+        if pos != -1 and pos < wildcard_pos:
+            wildcard_pos = pos
+
+    # Find the last path separator before the wildcard
+    prefix = path[:wildcard_pos]
+    last_sep = prefix.rfind(OS_SEP)
+
+    if last_sep != -1:
+        # Include the separator in the prefix
+        prefix = path[:last_sep + 1]
+    else:
+        # No separator before wildcard, prefix is empty or bucket name
+        prefix = ""
+
+    return prefix, path
+
+
+def match_wildcard(pattern: str, path: str) -> bool:
+    """
+    Match a path against a wildcard pattern using glob-like semantics.
+
+    Unlike fnmatch, this function treats path separators specially:
+    - '*' matches any characters EXCEPT path separators
+    - '?' matches any single character EXCEPT path separators
+    - Use '**' to match across directory boundaries (not yet supported)
+
+    This ensures consistent behavior with glob.glob() used for local files.
+
+    Args:
+        pattern: Pattern with wildcards (e.g., "bucket/path/*.parquet")
+        path: Path to match (e.g., "bucket/path/file1.parquet")
+
+    Returns:
+        True if path matches pattern
+
+    Examples:
+        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/file.parquet")
+        True
+        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/sub/file.parquet")
+        False
+    """
+    # Split pattern and path into parts using OS path separator for cross-platform compatibility
+    pattern_parts = pattern.split(OS_SEP)
+    path_parts = path.split(OS_SEP)
+
+    # Must have same number of path parts for a match (wildcards don't cross directory boundaries)
+    if len(pattern_parts) != len(path_parts):
+        return False
+
+    # Match each part using fnmatch
+    for pattern_part, path_part in zip(pattern_parts, path_parts):
+        if not fnmatch.fnmatch(path_part, pattern_part):
+            return False
+
+    return True
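
Taken together, the three helpers support a list-then-filter workflow. A short usage sketch (assuming the module is importable as `opteryx.utils.paths`, a POSIX `os.sep` of `/`, and made-up bucket and file names):

```python
from opteryx.utils import paths

dataset = "bucket/year=2024/*.parquet"

if paths.has_wildcards(dataset):
    prefix, pattern = paths.split_wildcard_path(dataset)
    # prefix  -> "bucket/year=2024/"           (used to list blobs)
    # pattern -> "bucket/year=2024/*.parquet"  (used to filter the listing)
    print(paths.match_wildcard(pattern, "bucket/year=2024/part-000.parquet"))         # True
    print(paths.match_wildcard(pattern, "bucket/year=2024/nested/part-000.parquet"))  # False
```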

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1651"
+version = "0.26.0-beta.1652"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}

setup.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 import os
 import platform
 import sys
-from distutils.sysconfig import get_config_var
+from sysconfig import get_config_var
 from typing import Any
 from typing import Dict

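The swap from `distutils.sysconfig` to the standard-library `sysconfig` module keeps `get_config_var` as a drop-in lookup (distutils was removed in Python 3.12). A quick illustration (the printed value is platform-dependent):

```python
from sysconfig import get_config_var

# e.g. the filename suffix used for compiled extension modules on this interpreter
print(get_config_var("EXT_SUFFIX"))  # something like '.cpython-311-x86_64-linux-gnu.so'
```
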
testdata/PERMISSIONS_README.md

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@

# Protocol Prefix Permissions

This directory contains example permission configurations for controlling access to different data sources in Opteryx.

## permissions.json Format

The `permissions.json` file contains one JSON object per line, each defining a permission rule:

```json
{"role":"role_name", "permission": "READ", "table": "pattern"}
```

- **role**: The name of the role that has this permission
- **permission**: The type of permission (currently only "READ" is supported)
- **table**: A pattern (supporting wildcards) that matches table names
## Protocol Prefixes as Table Namespaces

Protocol prefixes (`file://`, `gs://`, `s3://`) are treated as table namespaces, just like dataset namespaces (e.g., `opteryx.*`). You can control access to these protocols by adding permission entries for specific roles.

### Example Configurations

#### Restrict a Role to Dataset Access Only (No Cloud Storage)
```json
{"role":"restricted", "permission": "READ", "table": "opteryx.*"}
```
Users with the `restricted` role can only access tables in the `opteryx.*` namespace; they cannot access `file://`, `gs://`, or `s3://` paths.

#### Grant a Role Access to Datasets and GCS
```json
{"role":"data_analyst", "permission": "READ", "table": "opteryx.*"}
{"role":"data_analyst", "permission": "READ", "table": "gs://*"}
```
Users with the `data_analyst` role can access both `opteryx.*` tables and any `gs://` paths.

#### Grant a Role Access to All Cloud Protocols
```json
{"role":"data_engineer", "permission": "READ", "table": "opteryx.*"}
{"role":"data_engineer", "permission": "READ", "table": "file://*"}
{"role":"data_engineer", "permission": "READ", "table": "gs://*"}
{"role":"data_engineer", "permission": "READ", "table": "s3://*"}
```
Users with the `data_engineer` role can access all data sources.

#### Grant a Role Access to Specific GCS Buckets
```json
{"role":"project_team", "permission": "READ", "table": "gs://project-bucket/*"}
```
Users with the `project_team` role can only access paths in the `gs://project-bucket/` bucket.

## Default Access

The system includes a default role `opteryx` with wildcard access to everything:
```json
{"role":"opteryx", "permission": "READ", "table": "*"}
```
This is added automatically and cannot be overridden by the permissions.json file.

## Usage in Queries

When you query using protocol prefixes, the permission system checks if your role has access to that table pattern:

```sql
-- Requires a role with permission for "gs://*" pattern
SELECT * FROM gs://my-bucket/data/*.parquet

-- Requires a role with permission for "s3://*" pattern
SELECT * FROM s3://my-bucket/logs/2024-01-??.csv

-- Requires a role with permission for "file://*" pattern
SELECT * FROM file://path/to/data/*.csv

-- Requires a role with permission for "opteryx.*" pattern
SELECT * FROM opteryx.space_missions
```

## Multiple Roles

Users can have multiple roles. If any role grants access to a table pattern, the user can access it:

```sql
-- User with roles ["restricted", "cloud_user"] where:
-- - "restricted" has permission for "opteryx.*"
-- - "cloud_user" has permission for "gs://*"

-- ✓ Allowed - restricted role grants access
SELECT * FROM opteryx.space_missions

-- ✓ Allowed - cloud_user role grants access
SELECT * FROM gs://bucket/data/*.parquet

-- ✗ Denied - no role grants access
SELECT * FROM s3://bucket/data/*.parquet
```
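
The "any role grants access" behaviour described above can be sketched with `fnmatch` applied to the table patterns (an illustration only, not part of the committed README; Opteryx's actual check may differ in detail):

```python
import fnmatch

def can_read(table: str, user_roles: list[str], rules: list[dict]) -> bool:
    # Access is granted if any of the user's roles has a READ rule whose pattern matches the table.
    return any(
        rule["role"] in user_roles
        and rule["permission"] == "READ"
        and fnmatch.fnmatch(table, rule["table"])
        for rule in rules
    )

rules = [
    {"role": "restricted", "permission": "READ", "table": "opteryx.*"},
    {"role": "cloud_user", "permission": "READ", "table": "gs://*"},
]
roles = ["restricted", "cloud_user"]
print(can_read("opteryx.space_missions", roles, rules))      # True
print(can_read("gs://bucket/data/*.parquet", roles, rules))  # True
print(can_read("s3://bucket/data/*.parquet", roles, rules))  # False
```
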
## Security Best Practices

1. **Least Privilege**: Only grant the minimum permissions needed for each role
2. **Namespace Separation**: Use table patterns to restrict access to specific namespaces or buckets
3. **Protocol Control**: Explicitly grant or deny protocol access (file://, gs://, s3://) per role
4. **Monitor Access**: Log and review which roles access which data sources
5. **Audit Regularly**: Review and update permissions as access requirements change

## Testing

See `tests/unit/security/test_protocol_permissions.py` for comprehensive tests of the protocol prefix permission system.
