
Commit 0c3e43e

Merge pull request #2846 from mabel-dev/copilot/support-protocol-prefixes
Support protocol prefixes for paths with wildcards (gs://, s3://) and permission controls
2 parents 9214c45 + 88d8a0e commit 0c3e43e

11 files changed: +760 −22 lines

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY

-__build__ = 1651
+__build__ = 1652
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1651"
+__version__ = "0.26.0-beta.1652"

 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -285,7 +285,10 @@ def connector_factory(dataset, statistics, **config):
     prefix = connector_entry.pop("prefix", "")
     remove_prefix = connector_entry.pop("remove_prefix", False)
     if prefix and remove_prefix and dataset.startswith(prefix):
-        dataset = dataset[len(prefix) + 1 :]
+        # Remove the prefix. If there's a separator (. or //) after the prefix, skip it too
+        dataset = dataset[len(prefix):]
+        if dataset.startswith(".") or dataset.startswith("//"):
+            dataset = dataset[1:] if dataset.startswith(".") else dataset[2:]

     return connector(dataset=dataset, statistics=statistics, **connector_entry)
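
The prefix stripping now accepts both dotted dataset names and protocol-style paths. A standalone sketch of the same logic (hypothetical prefix and dataset values, not the connector code itself):

```python
def strip_prefix(dataset: str, prefix: str) -> str:
    # Mirror of the change above: drop the prefix, then a single "." or "//" separator.
    if prefix and dataset.startswith(prefix):
        dataset = dataset[len(prefix):]
        if dataset.startswith("."):
            dataset = dataset[1:]
        elif dataset.startswith("//"):
            dataset = dataset[2:]
    return dataset

print(strip_prefix("lake.customers.orders", "lake"))        # -> customers.orders
print(strip_prefix("lake//bucket/path/*.parquet", "lake"))  # -> bucket/path/*.parquet
```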

opteryx/connectors/aws_s3_connector.py

Lines changed: 44 additions & 9 deletions

@@ -86,19 +86,54 @@ def __init__(self, credentials=None, **kwargs):
         )

         self.minio = Minio(end_point, access_key, secret_key, secure=secure)
-        self.dataset = self.dataset.replace(".", OS_SEP)
+
+        # Only convert dots to path separators if the dataset doesn't already contain slashes
+        # Dataset references like "my.dataset.table" use dots as separators
+        # File paths like "bucket/path/file.parquet" already have slashes and should not be converted
+        if OS_SEP not in self.dataset and "/" not in self.dataset:
+            self.dataset = self.dataset.replace(".", OS_SEP)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = paths.has_wildcards(self.dataset)
+        if self.has_wildcards:
+            # For wildcards, we need to split into prefix and pattern
+            self.wildcard_prefix, self.wildcard_pattern = paths.split_wildcard_path(self.dataset)
+        else:
+            self.wildcard_prefix = None
+            self.wildcard_pattern = None

     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
-        bucket, object_path, _, _ = paths.get_parts(prefix)
+        # If we have wildcards, use the wildcard prefix for listing
+        if self.has_wildcards:
+            list_prefix = self.wildcard_prefix
+            filter_pattern = self.wildcard_pattern
+        else:
+            list_prefix = prefix
+            filter_pattern = None
+
+        bucket, object_path, _, _ = paths.get_parts(list_prefix)
         blobs = self.minio.list_objects(bucket_name=bucket, prefix=object_path, recursive=True)
-        blobs = (
-            bucket + "/" + blob.object_name for blob in blobs if not blob.object_name.endswith("/")
-        )
-
-        return sorted(
-            blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS
-        )
+
+        blob_list = []
+        for blob in blobs:
+            if blob.object_name.endswith("/"):
+                continue
+
+            full_path = bucket + "/" + blob.object_name
+
+            # Check if blob has valid extension
+            if ("." + full_path.split(".")[-1].lower()) not in VALID_EXTENSIONS:
+                continue
+
+            # If we have a wildcard pattern, filter by it
+            if filter_pattern:
+                if paths.match_wildcard(filter_pattern, full_path):
+                    blob_list.append(full_path)
+            else:
+                blob_list.append(full_path)
+
+        return sorted(blob_list)

     def read_dataset(
         self, columns: list = None, just_schema: bool = False, **kwargs
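
The listing strategy here is "list by the longest literal prefix, then filter against the full pattern". A self-contained sketch of that flow with made-up blob names (no MinIO client involved; the segment matcher is re-implemented inline):

```python
import fnmatch

def match_segments(pattern: str, path: str) -> bool:
    # Like match_wildcard above: '*' and '?' do not cross '/' boundaries.
    pattern_parts, path_parts = pattern.split("/"), path.split("/")
    if len(pattern_parts) != len(path_parts):
        return False
    return all(fnmatch.fnmatch(part, pat) for part, pat in zip(path_parts, pattern_parts))

# Hypothetical listing returned by the object store for the prefix "bucket/logs/"
listed = [
    "bucket/logs/2024-01-01.parquet",
    "bucket/logs/2024-01-02.csv",
    "bucket/logs/archive/2023-12-31.parquet",
]

pattern = "bucket/logs/*.parquet"
print(sorted(blob for blob in listed if match_segments(pattern, blob)))
# ['bucket/logs/2024-01-01.parquet']
```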

opteryx/connectors/gcp_cloudstorage_connector.py

Lines changed: 39 additions & 8 deletions

@@ -91,9 +91,24 @@ def __init__(self, credentials=None, **kwargs):
         Asynchronous.__init__(self, **kwargs)
         Statistics.__init__(self, **kwargs)

-        self.dataset = self.dataset.replace(".", OS_SEP)
+        # Only convert dots to path separators if the dataset doesn't already contain slashes
+        # Dataset references like "my.dataset.table" use dots as separators
+        # File paths like "bucket/path/file.parquet" already have slashes and should not be converted
+        if OS_SEP not in self.dataset and "/" not in self.dataset:
+            self.dataset = self.dataset.replace(".", OS_SEP)
         self.credentials = credentials
-        self.bucket, _, _, _ = paths.get_parts(self.dataset)
+
+        # Check if dataset contains wildcards
+        self.has_wildcards = paths.has_wildcards(self.dataset)
+        if self.has_wildcards:
+            # For wildcards, we need to split into prefix and pattern
+            # The prefix is used for listing, pattern for filtering
+            self.wildcard_prefix, self.wildcard_pattern = paths.split_wildcard_path(self.dataset)
+            self.bucket, _, _, _ = paths.get_parts(self.wildcard_prefix or self.dataset)
+        else:
+            self.wildcard_prefix = None
+            self.wildcard_pattern = None
+            self.bucket, _, _, _ = paths.get_parts(self.dataset)

         # we're going to cache the first blob as the schema and dataset reader
         # sometimes both start here
@@ -181,7 +196,15 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
         if prefix in self.blob_list:
             return self.blob_list[prefix]

-        bucket, object_path, _, _ = paths.get_parts(prefix)
+        # If we have wildcards, use the wildcard prefix for listing
+        if self.has_wildcards:
+            list_prefix = self.wildcard_prefix
+            filter_pattern = self.wildcard_pattern
+        else:
+            list_prefix = prefix
+            filter_pattern = None
+
+        bucket, object_path, _, _ = paths.get_parts(list_prefix)
         if "kh" not in bucket:
             bucket = bucket.replace("va_data", "va-data")
             bucket = bucket.replace("data_", "data-")
@@ -204,11 +227,19 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
             raise DatasetReadError(f"Error fetching blob list: {response.text}")

         blob_data = response.json()
-        blob_names.extend(
-            f"{bucket}/{name}"
-            for name in (blob["name"] for blob in blob_data.get("items", []))
-            if name.endswith(TUPLE_OF_VALID_EXTENSIONS)
-        )
+        for blob in blob_data.get("items", []):
+            name = blob["name"]
+            if not name.endswith(TUPLE_OF_VALID_EXTENSIONS):
+                continue

+            full_path = f"{bucket}/{name}"
+
+            # If we have a wildcard pattern, filter by it
+            if filter_pattern:
+                if paths.match_wildcard(filter_pattern, full_path):
+                    blob_names.append(full_path)
+            else:
+                blob_names.append(full_path)

         page_token = blob_data.get("nextPageToken")
         if not page_token:
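
On the GCS side the same pattern filter sits inside a paginated listing loop. A condensed, requests-based sketch of that loop (the endpoint URL, the omitted authentication, and the `match` callable are simplifications for illustration, not the connector's own code):

```python
import requests

def list_matching_blobs(bucket: str, list_prefix: str, filter_pattern, match) -> list:
    """Page through a GCS JSON API object listing, keeping blobs that match the pattern."""
    blob_names = []
    page_token = None
    while True:
        params = {"prefix": list_prefix}
        if page_token:
            params["pageToken"] = page_token
        response = requests.get(
            f"https://storage.googleapis.com/storage/v1/b/{bucket}/o", params=params, timeout=30
        )
        response.raise_for_status()
        blob_data = response.json()
        for blob in blob_data.get("items", []):
            full_path = f"{bucket}/{blob['name']}"
            if filter_pattern is None or match(filter_pattern, full_path):
                blob_names.append(full_path)
        page_token = blob_data.get("nextPageToken")
        if not page_token:
            break
    return blob_names
```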

opteryx/utils/paths.py

Lines changed: 103 additions & 0 deletions

@@ -7,6 +7,7 @@
 Functions to help with handling file paths
 """

+import fnmatch
 import os

 OS_SEP = os.sep
@@ -39,3 +40,105 @@ def get_parts(path_string: str):
     parts_path = OS_SEP.join(parts)

     return bucket, parts_path, file_name, suffix
+
+
+def has_wildcards(path: str) -> bool:
+    """
+    Check if a path contains wildcard characters.
+
+    Args:
+        path: Path string to check
+
+    Returns:
+        True if path contains wildcards (*, ?, [])
+    """
+    return any(char in path for char in ['*', '?', '['])
+
+
+def split_wildcard_path(path: str):
+    """
+    Split a path with wildcards into a non-wildcard prefix and wildcard pattern.
+
+    For cloud storage, we need to list blobs with a prefix, then filter by pattern.
+    This function finds the longest non-wildcard prefix for listing.
+
+    Args:
+        path: Path with potential wildcards (e.g., "bucket/path/subdir/*.parquet")
+
+    Returns:
+        tuple: (prefix, pattern) where:
+            - prefix: Non-wildcard prefix for listing (e.g., "bucket/path/subdir/")
+            - pattern: Full path with wildcards for matching (e.g., "bucket/path/subdir/*.parquet")
+
+    Examples:
+        >>> split_wildcard_path("bucket/path/*.parquet")
+        ('bucket/path/', 'bucket/path/*.parquet')
+
+        >>> split_wildcard_path("bucket/path/file[0-9].parquet")
+        ('bucket/path/', 'bucket/path/file[0-9].parquet')
+
+        >>> split_wildcard_path("bucket/*/data.parquet")
+        ('bucket/', 'bucket/*/data.parquet')
+    """
+    if not has_wildcards(path):
+        return path, path
+
+    # Find the first wildcard character
+    wildcard_pos = len(path)
+    for char in ['*', '?', '[']:
+        pos = path.find(char)
+        if pos != -1 and pos < wildcard_pos:
+            wildcard_pos = pos
+
+    # Find the last path separator before the wildcard
+    prefix = path[:wildcard_pos]
+    last_sep = prefix.rfind(OS_SEP)
+
+    if last_sep != -1:
+        # Include the separator in the prefix
+        prefix = path[:last_sep + 1]
+    else:
+        # No separator before wildcard, prefix is empty or bucket name
+        prefix = ""
+
+    return prefix, path
+
+
+def match_wildcard(pattern: str, path: str) -> bool:
+    """
+    Match a path against a wildcard pattern using glob-like semantics.
+
+    Unlike fnmatch, this function treats path separators specially:
+    - '*' matches any characters EXCEPT path separators
+    - '?' matches any single character EXCEPT path separators
+    - Use '**' to match across directory boundaries (not yet supported)
+
+    This ensures consistent behavior with glob.glob() used for local files.
+
+    Args:
+        pattern: Pattern with wildcards (e.g., "bucket/path/*.parquet")
+        path: Path to match (e.g., "bucket/path/file1.parquet")
+
+    Returns:
+        True if path matches pattern
+
+    Examples:
+        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/file.parquet")
+        True
+        >>> match_wildcard("bucket/path/*.parquet", "bucket/path/sub/file.parquet")
+        False
+    """
+    # Split pattern and path into parts using OS path separator for cross-platform compatibility
+    pattern_parts = pattern.split(OS_SEP)
+    path_parts = path.split(OS_SEP)
+
+    # Must have same number of path parts for a match (wildcards don't cross directory boundaries)
+    if len(pattern_parts) != len(path_parts):
+        return False
+
+    # Match each part using fnmatch
+    for pattern_part, path_part in zip(pattern_parts, path_parts):
+        if not fnmatch.fnmatch(path_part, pattern_part):
+            return False
+
+    return True
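
Taken together, the three helpers support a list-then-filter workflow. A short usage sketch (assuming the module is importable as `opteryx.utils.paths`, a POSIX `os.sep` of `/`, and made-up bucket and file names):

```python
from opteryx.utils import paths

dataset = "bucket/year=2024/*.parquet"

if paths.has_wildcards(dataset):
    prefix, pattern = paths.split_wildcard_path(dataset)
    # prefix  -> "bucket/year=2024/"           (used to list blobs)
    # pattern -> "bucket/year=2024/*.parquet"  (used to filter the listing)
    print(paths.match_wildcard(pattern, "bucket/year=2024/part-000.parquet"))         # True
    print(paths.match_wildcard(pattern, "bucket/year=2024/nested/part-000.parquet"))  # False
```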

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1651"
+version = "0.26.0-beta.1652"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}

setup.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 import os
 import platform
 import sys
-from distutils.sysconfig import get_config_var
+from sysconfig import get_config_var
 from typing import Any
 from typing import Dict

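The swap from `distutils.sysconfig` to the standard-library `sysconfig` module keeps `get_config_var` as a drop-in lookup (distutils was removed in Python 3.12). A quick illustration (the printed value is platform-dependent):

```python
from sysconfig import get_config_var

# e.g. the filename suffix used for compiled extension modules on this interpreter
print(get_config_var("EXT_SUFFIX"))  # something like '.cpython-311-x86_64-linux-gnu.so'
```
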
testdata/PERMISSIONS_README.md

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@

# Protocol Prefix Permissions

This directory contains example permission configurations for controlling access to different data sources in Opteryx.

## permissions.json Format

The `permissions.json` file contains one JSON object per line, each defining a permission rule:

```json
{"role":"role_name", "permission": "READ", "table": "pattern"}
```

- **role**: The name of the role that has this permission
- **permission**: The type of permission (currently only "READ" is supported)
- **table**: A pattern (supporting wildcards) that matches table names
## Protocol Prefixes as Table Namespaces

Protocol prefixes (`file://`, `gs://`, `s3://`) are treated as table namespaces, just like dataset namespaces (e.g., `opteryx.*`). You can control access to these protocols by adding permission entries for specific roles.

### Example Configurations

#### Restrict a Role to Dataset Access Only (No Cloud Storage)
```json
{"role":"restricted", "permission": "READ", "table": "opteryx.*"}
```
Users with the `restricted` role can only access tables in the `opteryx.*` namespace; they cannot access `file://`, `gs://`, or `s3://` paths.

#### Grant a Role Access to Datasets and GCS
```json
{"role":"data_analyst", "permission": "READ", "table": "opteryx.*"}
{"role":"data_analyst", "permission": "READ", "table": "gs://*"}
```
Users with the `data_analyst` role can access both `opteryx.*` tables and any `gs://` paths.

#### Grant a Role Access to All Cloud Protocols
```json
{"role":"data_engineer", "permission": "READ", "table": "opteryx.*"}
{"role":"data_engineer", "permission": "READ", "table": "file://*"}
{"role":"data_engineer", "permission": "READ", "table": "gs://*"}
{"role":"data_engineer", "permission": "READ", "table": "s3://*"}
```
Users with the `data_engineer` role can access all data sources.

#### Grant a Role Access to Specific GCS Buckets
```json
{"role":"project_team", "permission": "READ", "table": "gs://project-bucket/*"}
```
Users with the `project_team` role can only access paths in the `gs://project-bucket/` bucket.

## Default Access

The system includes a default role `opteryx` with wildcard access to everything:
```json
{"role":"opteryx", "permission": "READ", "table": "*"}
```
This is added automatically and cannot be overridden by the permissions.json file.

## Usage in Queries

When you query using protocol prefixes, the permission system checks if your role has access to that table pattern:

```sql
-- Requires a role with permission for "gs://*" pattern
SELECT * FROM gs://my-bucket/data/*.parquet

-- Requires a role with permission for "s3://*" pattern
SELECT * FROM s3://my-bucket/logs/2024-01-??.csv

-- Requires a role with permission for "file://*" pattern
SELECT * FROM file://path/to/data/*.csv

-- Requires a role with permission for "opteryx.*" pattern
SELECT * FROM opteryx.space_missions
```

## Multiple Roles

Users can have multiple roles. If any role grants access to a table pattern, the user can access it:

```sql
-- User with roles ["restricted", "cloud_user"] where:
-- - "restricted" has permission for "opteryx.*"
-- - "cloud_user" has permission for "gs://*"

-- ✓ Allowed - restricted role grants access
SELECT * FROM opteryx.space_missions

-- ✓ Allowed - cloud_user role grants access
SELECT * FROM gs://bucket/data/*.parquet

-- ✗ Denied - no role grants access
SELECT * FROM s3://bucket/data/*.parquet
```
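
The "any role grants access" behaviour described above can be sketched with `fnmatch` applied to the table patterns (an illustration only, not part of the committed README; Opteryx's actual check may differ in detail):

```python
import fnmatch

def can_read(table: str, user_roles: list[str], rules: list[dict]) -> bool:
    # Access is granted if any of the user's roles has a READ rule whose pattern matches the table.
    return any(
        rule["role"] in user_roles
        and rule["permission"] == "READ"
        and fnmatch.fnmatch(table, rule["table"])
        for rule in rules
    )

rules = [
    {"role": "restricted", "permission": "READ", "table": "opteryx.*"},
    {"role": "cloud_user", "permission": "READ", "table": "gs://*"},
]
roles = ["restricted", "cloud_user"]
print(can_read("opteryx.space_missions", roles, rules))      # True
print(can_read("gs://bucket/data/*.parquet", roles, rules))  # True
print(can_read("s3://bucket/data/*.parquet", roles, rules))  # False
```
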
## Security Best Practices

1. **Least Privilege**: Only grant the minimum permissions needed for each role
2. **Namespace Separation**: Use table patterns to restrict access to specific namespaces or buckets
3. **Protocol Control**: Explicitly grant or deny protocol access (file://, gs://, s3://) per role
4. **Monitor Access**: Log and review which roles access which data sources
5. **Audit Regularly**: Review and update permissions as access requirements change

## Testing

See `tests/unit/security/test_protocol_permissions.py` for comprehensive tests of the protocol prefix permission system.
