Skip to content

Commit 5cc967d

Browse files
authored
Merge pull request #66 from mabel-dev/0.5.6
plan/exec sep, phase 1
2 parents d5ada20 + ce4c882 commit 5cc967d

File tree

18 files changed

+927
-200
lines changed

18 files changed

+927
-200
lines changed

dev/build_counter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class VersionStatus(Enum):
2929

3030
__major_version__ = 0
3131
__minor_version__ = 5
32-
__revision_version__ = 8
32+
__revision_version__ = 9
3333
__author__ = "@joocer"
3434
__status__ = VersionStatus.RELEASE
3535

opteryx/__version__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 137
4+
__build__ = 139
55
__author__ = "@joocer"
6-
__version__ = "0.5.8"
6+
__version__ = "0.5.9"
77
__lib__ = "opteryx-core"
8-
__build_date__ = "2026-01-03T20:41:15.799632+00:00Z"
8+
__build_date__ = "2026-01-04T16:13:49.116825+00:00Z"
99

1010
# Store the version here so:
1111
# 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/filesystem_connector.py

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from concurrent.futures import wait
1212
from threading import Lock
1313
from typing import Dict
14-
from typing import List
1514
from typing import Optional
1615
from typing import Tuple
1716

@@ -108,6 +107,26 @@ def get_executor(self):
108107
)
109108
return FileSystemTable._executor
110109

110+
def get_list_of_blob_names(self, prefix: str, predicates=None):
111+
"""
112+
Get list of blob names (file paths) matching the prefix.
113+
114+
Args:
115+
prefix: Directory/path prefix to list files from
116+
predicates: Optional predicates (not used for file listing)
117+
118+
Returns:
119+
List of file paths
120+
"""
121+
from pyarrow.fs import FileSelector
122+
123+
# Create file selector to list files recursively
124+
selector = FileSelector(prefix, recursive=True)
125+
file_infos = self.filesystem.get_file_info(selector)
126+
127+
# Extract paths from FileInfo objects
128+
return [info.path for info in file_infos]
129+
111130
def read_blob(
112131
self, *, blob_name: str, decoder, just_schema=False, projection=None, selection=None
113132
):
@@ -185,32 +204,6 @@ def blocking_read():
185204
telemetry.bytes_read += len(data)
186205
return ref
187206

188-
def get_list_of_blob_names(self, *, prefix: str, predicates: list = []) -> List[str]:
189-
"""
190-
List all blobs matching the prefix.
191-
192-
Args:
193-
prefix: Path prefix to search
194-
predicates: Optional predicates for filtering (subclasses may use this)
195-
196-
Returns:
197-
List of blob paths
198-
"""
199-
from pyarrow.fs import FileSelector
200-
201-
# Use filesystem's file listing
202-
selector = FileSelector(prefix, recursive=True)
203-
file_infos = self.filesystem.get_file_info(selector)
204-
205-
# Filter for valid file extensions
206-
blob_names = [
207-
info.path
208-
for info in file_infos
209-
if info.is_file and info.path.endswith(TUPLE_OF_VALID_EXTENSIONS)
210-
]
211-
212-
return blob_names
213-
214207
def read_dataset(
215208
self,
216209
columns: list = None,

opteryx/connectors/io_systems/__init__.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,44 @@
1313
"OpteryxLocalFileSystem",
1414
"OpteryxGcsFileSystem",
1515
"OpteryxS3FileSystem",
16+
"create_filesystem",
1617
]
18+
19+
20+
def create_filesystem(protocol: str):
21+
"""
22+
Factory function to instantiate appropriate filesystem based on protocol.
23+
24+
Used by execution operators to create filesystem from file path protocol prefix.
25+
This enables generic execution that works across all storage types.
26+
27+
Args:
28+
protocol: Protocol string from file path (e.g., "gs", "s3", "file")
29+
30+
Returns:
31+
Appropriate filesystem instance
32+
33+
Raises:
34+
ValueError: If protocol is not supported
35+
36+
Example:
37+
>>> protocol = "gs" # from "gs://bucket/file.parquet"
38+
>>> fs = create_filesystem(protocol)
39+
>>> # fs is an OpteryxGcsFileSystem instance
40+
"""
41+
protocol_map = {
42+
"gs": OpteryxGcsFileSystem,
43+
"gcs": OpteryxGcsFileSystem,
44+
"s3": OpteryxS3FileSystem,
45+
"file": OpteryxLocalFileSystem,
46+
"": OpteryxLocalFileSystem, # No protocol = local file
47+
}
48+
49+
if protocol not in protocol_map:
50+
raise ValueError(
51+
f"Unsupported storage protocol: {protocol}. "
52+
f"Supported protocols: {list(protocol_map.keys())}"
53+
)
54+
55+
filesystem_class = protocol_map[protocol]
56+
return filesystem_class()

opteryx/connectors/io_systems/gcs_filesystem.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,9 @@ def open_input_file(self, path: str):
148148
async def async_read_blob(self, *, blob_name, pool, session, statistics, **kwargs):
149149
import asyncio
150150

151-
from opteryx import system_statistics
151+
from opteryx import system_telemetry as system_statistics
152152
from opteryx.utils import paths
153153

154-
print("async read blob:", blob_name)
155-
156154
# strip gs:// prefix
157155
if blob_name.startswith("gs://"):
158156
blob_name = blob_name[5:]

0 commit comments

Comments
 (0)