Skip to content

Commit f945e71

Browse files
committed
start up and INSTR performance
1 parent cca1cc9 commit f945e71

File tree

11 files changed

+263
-100
lines changed

11 files changed

+263
-100
lines changed

opteryx/__init__.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@
2525
from pathlib import Path
2626

2727
from decimal import getcontext
28-
from typing import Optional, Union, Dict, Any, List
28+
from typing import Optional, Union, Dict, Any, List, TYPE_CHECKING
2929

30-
import pyarrow
30+
if TYPE_CHECKING: # pragma: no cover - only for type checkers
31+
import pyarrow
3132

3233
# Set Decimal precision to 28 globally
3334
getcontext().prec = 28
@@ -51,16 +52,19 @@ def is_mac() -> bool: # pragma: no cover
5152

5253

5354
# python-dotenv allows us to create an environment file to store secrets.
54-
try:
55-
import dotenv # type:ignore
55+
# Only try to import dotenv if a .env file exists to avoid paying the
56+
# import cost when no environment file is present.
57+
_env_path = Path.cwd() / ".env"
58+
if _env_path.exists():
59+
try:
60+
import dotenv # type:ignore
5661

57-
_env_path = Path.cwd() / ".env"
58-
if _env_path.exists() and dotenv is not None:
5962
dotenv.load_dotenv(dotenv_path=_env_path)
6063
if OPTERYX_DEBUG:
6164
print(f"{datetime.datetime.now()} [LOADER] Loading `.env` file.")
62-
except ImportError: # pragma: no cover
63-
pass
65+
except ImportError: # pragma: no cover
66+
# dotenv is optional; if it's not installed, just continue.
67+
pass
6468

6569

6670
if OPTERYX_DEBUG: # pragma: no cover
@@ -179,7 +183,7 @@ def query_to_arrow(
179183
visibility_filters: Optional[Dict[str, Any]] = None,
180184
limit: int = None,
181185
**kwargs,
182-
) -> pyarrow.Table:
186+
) -> "pyarrow.Table":
183187
"""
184188
Helper function to execute a query and return a pyarrow Table.
185189

opteryx/__main__.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
import argparse
1414
import os
15-
import readline
1615
import sys
1716
import threading
1817
import time
@@ -25,10 +24,6 @@
2524
from opteryx.utils.sql import clean_statement
2625
from opteryx.utils.sql import remove_comments
2726

28-
29-
if readline:
30-
pass
31-
3227
# Define ANSI color codes
3328
ANSI_RED = "\u001b[31m"
3429
ANSI_RESET = "\u001b[0m"
@@ -209,6 +204,7 @@ def main():
209204

210205

211206
if __name__ == "__main__":
207+
import readline # pragma: no cover
212208
try:
213209
main()
214210
except Exception as e:

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1685
4+
__build__ = 1688
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1685"
6+
__version__ = "0.26.0-beta.1688"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/list_ops/list_in_string.pyx

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ numpy.import_array()
3333

3434
from cpython.bytes cimport PyBytes_AsString
3535
from libc.stdint cimport int32_t, uint8_t, uintptr_t
36-
from libc.string cimport memchr
36+
from libc.string cimport memchr, memcpy
3737
import platform
3838

3939

@@ -65,12 +65,18 @@ init_searcher()
6565

6666

6767
cdef inline int fast_memcmp_short(const char *a, const char *b, size_t n):
68-
"""Optimized memcmp for short strings (<= 8 bytes)"""
69-
cdef size_t i
70-
for i in range(n):
71-
if a[i] != b[i]:
72-
return 1
73-
return 0
68+
cdef uint64_t aval = 0, bval = 0
69+
cdef uint64_t mask
70+
71+
if n == 0:
72+
return 0
73+
elif n <= 8:
74+
mask = ((<uint64_t>1) << (8 * n)) - 1
75+
memcpy(&aval, a, n)
76+
memcpy(&bval, b, n)
77+
return (aval & mask) != (bval & mask)
78+
else:
79+
return memcmp(a, b, n) != 0
7480

7581

7682
cdef inline int boyer_moore_horspool(const char *haystack, size_t haystacklen,
@@ -146,7 +152,9 @@ cdef inline int boyer_moore_horspool_with_table(const char *haystack, size_t hay
146152
"""BMH with precomputed table - optimized version"""
147153
cdef size_t i = 0
148154
cdef size_t tail_index
155+
cdef size_t needlelen_sub1 = needlelen - 1
149156
cdef unsigned char last_char
157+
cdef unsigned char tail_char
150158

151159
if needlelen == 0 or haystacklen < needlelen:
152160
return 0
@@ -155,20 +163,24 @@ cdef inline int boyer_moore_horspool_with_table(const char *haystack, size_t hay
155163
if needlelen == 1:
156164
return memchr(haystack, needle[0], haystacklen) != NULL
157165

158-
last_char = <unsigned char>needle[needlelen - 1]
166+
last_char = <unsigned char>needle[needlelen_sub1]
167+
cdef size_t end_index = haystacklen - needlelen
159168

160-
while i <= haystacklen - needlelen:
161-
# Check last character first
162-
if haystack[i + needlelen - 1] == last_char:
163-
if needlelen <= 8:
164-
if fast_memcmp_short(&haystack[i], needle, needlelen) == 0:
165-
return 1
166-
else:
167-
if memcmp(&haystack[i], needle, needlelen) == 0:
168-
return 1
169+
while i <= end_index:
170+
tail_index = i + needlelen_sub1
171+
tail_char = <unsigned char>haystack[tail_index]
169172

170-
tail_index = i + needlelen - 1
171-
i += skip[<unsigned char>haystack[tail_index]]
173+
# Check last character first
174+
if tail_char == last_char:
175+
if haystack[i] == needle[0]:
176+
if needlelen <= 8:
177+
if fast_memcmp_short(&haystack[i], needle, needlelen) == 0:
178+
return 1
179+
else:
180+
if memcmp(&haystack[i], needle, needlelen) == 0:
181+
return 1
182+
183+
i += skip[tail_char]
172184

173185
return 0
174186

opteryx/config.py

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
from typing import Optional
1111
from typing import Union
1212

13-
import psutil
14-
1513
_config_values: dict = {}
1614

1715
# we need a preliminary version of this variable
@@ -30,8 +28,15 @@ def memory_allocation_calculation(allocation: Union[float, int]) -> int:
3028
Returns:
3129
int: Memory size in bytes to be allocated.
3230
"""
33-
total_memory = psutil.virtual_memory().total # Convert bytes to megabytes
31+
# Import psutil lazily to avoid paying the import cost at module import time.
32+
# Use a small helper so tests or callers that need the value will trigger the
33+
# import only when this function is called.
34+
def _get_total_memory_bytes() -> int:
35+
import psutil
36+
37+
return psutil.virtual_memory().total
3438

39+
total_memory = _get_total_memory_bytes()
3540
if 0 < allocation < 1: # Treat as a percentage
3641
return int(total_memory * allocation)
3742
elif allocation >= 1: # Treat as an absolute value in MB
@@ -44,9 +49,13 @@ def system_gigabytes() -> int:
4449
"""
4550
Get the total system memory in gigabytes.
4651
52+
This imports psutil lazily to avoid paying the cost at module import time.
53+
4754
Returns:
4855
int: Total system memory in gigabytes.
4956
"""
57+
import psutil
58+
5059
return psutil.virtual_memory().total // (1024 * 1024 * 1024)
5160

5261

@@ -163,20 +172,67 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]:
163172
MAX_CONSECUTIVE_CACHE_FAILURES: int = int(get("MAX_CONSECUTIVE_CACHE_FAILURES", 10))
164173
"""Maximum number of consecutive cache failures before disabling cache usage."""
165174

166-
MAX_LOCAL_BUFFER_CAPACITY: int = memory_allocation_calculation(float(get("MAX_LOCAL_BUFFER_CAPACITY", 0.2)))
167-
"""Local buffer pool size in either bytes or fraction of system memory."""
175+
# These values are computed lazily via __getattr__ to avoid importing
176+
# psutil (and making expensive system calls) during module import.
177+
# Annotate the names so type checkers know about them, but do not assign
178+
# values here — __getattr__ will compute and cache them on first access.
179+
MAX_LOCAL_BUFFER_CAPACITY: int
180+
"""Local buffer pool size in either bytes or fraction of system memory (lazy)."""
168181

169-
MAX_READ_BUFFER_CAPACITY: int = memory_allocation_calculation(float(get("MAX_READ_BUFFER_CAPACITY", 0.1)))
170-
"""Read buffer pool size in either bytes or fraction of system memory."""
182+
MAX_READ_BUFFER_CAPACITY: int
183+
"""Read buffer pool size in either bytes or fraction of system memory (lazy)."""
171184

172185
MAX_STATISTICS_CACHE_ITEMS: int = get("MAX_STATISTICS_CACHE_ITEMS", 10_000)
173186
"""The number of .parquet files we cache the statistics for."""
174187

175-
CONCURRENT_READS: int = int(get("CONCURRENT_READS", max(system_gigabytes(), 2)))
176-
"""Number of read workers per data source."""
188+
_LAZY_VALUES: dict = {}
189+
190+
191+
# Lazily computed configuration values. We compute certain values on first
192+
# access because they depend on expensive system calls (psutil) or other
193+
# runtime properties. Access these as attributes on the module; __getattr__
194+
# will compute and cache them.
195+
196+
CONCURRENT_WORKERS_DEFAULT = int(get("CONCURRENT_WORKERS", 2))
197+
198+
199+
def _compute_MAX_LOCAL_BUFFER_CAPACITY():
200+
return memory_allocation_calculation(float(get("MAX_LOCAL_BUFFER_CAPACITY", 0.2)))
201+
202+
203+
def _compute_MAX_READ_BUFFER_CAPACITY():
204+
return memory_allocation_calculation(float(get("MAX_READ_BUFFER_CAPACITY", 0.1)))
205+
206+
207+
def _compute_CONCURRENT_READS():
208+
# default to max(system_gigabytes(), 2)
209+
return int(get("CONCURRENT_READS", max(system_gigabytes(), 2)))
210+
177211

178-
CONCURRENT_WORKERS: int = int(get("CONCURRENT_WORKERS", 2))
179-
"""Number of worker threads created to execute queries."""
212+
def __getattr__(name: str):
213+
"""Lazy attribute access for computed config values."""
214+
if name == "MAX_LOCAL_BUFFER_CAPACITY":
215+
val = _LAZY_VALUES.get(name)
216+
if val is None:
217+
val = _compute_MAX_LOCAL_BUFFER_CAPACITY()
218+
_LAZY_VALUES[name] = val
219+
return val
220+
if name == "MAX_READ_BUFFER_CAPACITY":
221+
val = _LAZY_VALUES.get(name)
222+
if val is None:
223+
val = _compute_MAX_READ_BUFFER_CAPACITY()
224+
_LAZY_VALUES[name] = val
225+
return val
226+
if name == "CONCURRENT_READS":
227+
val = _LAZY_VALUES.get(name)
228+
if val is None:
229+
val = _compute_CONCURRENT_READS()
230+
_LAZY_VALUES[name] = val
231+
return val
232+
if name == "CONCURRENT_WORKERS":
233+
# simple default, no expensive computation
234+
return CONCURRENT_WORKERS_DEFAULT
235+
raise AttributeError(name)
180236

181237
DATA_CATALOG_PROVIDER: str = get("DATA_CATALOG_PROVIDER")
182238
"""Data Catalog provider."""
@@ -197,6 +253,7 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]:
197253
# don't output resource (memory) utilization information
198254
ENABLE_RESOURCE_LOGGING: bool = bool(get("ENABLE_RESOURCE_LOGGING", False))
199255
# size of morsels to push between steps
256+
# MORSEL_SIZE remains a plain constant
200257
MORSEL_SIZE: int = int(get("MORSEL_SIZE", 64 * 1024 * 1024))
201258
# not GA
202259
PROFILE_LOCATION:str = get("PROFILE_LOCATION")

opteryx/connectors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ def get_dataset_schema(self, dataset):
8787

8888
import os
8989

90-
import pyarrow
9190

9291
# Lazy imports - connectors are only loaded when actually needed
9392
# This significantly improves module import time from ~500ms to ~130ms
@@ -196,6 +195,7 @@ def register_df(name, frame):
196195
"""register a orso, pandas or Polars dataframe"""
197196
# Lazy import ArrowConnector
198197
from opteryx.connectors.arrow_connector import ArrowConnector
198+
import pyarrow
199199

200200
# polars (maybe others) - the polars to arrow API is a mess
201201
if hasattr(frame, "_df"): # pragma: no cover

opteryx/connectors/virtual_data.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
import datetime
1515
import typing
1616

17-
import pyarrow
1817
from orso.schema import RelationSchema
1918

20-
from opteryx import virtual_datasets
19+
import importlib
20+
from typing import Tuple
2121
from opteryx.connectors.base.base_connector import BaseConnector
2222
from opteryx.connectors.base.base_connector import DatasetReader
2323
from opteryx.connectors.capabilities import Partitionable
@@ -26,19 +26,32 @@
2626
from opteryx.utils import arrow
2727

2828
WELL_KNOWN_DATASETS = {
29-
"$astronauts": (virtual_datasets.astronauts, True),
30-
"$planets": (virtual_datasets.planets, True),
31-
"$missions": (virtual_datasets.missions, True),
32-
"$satellites": (virtual_datasets.satellites, True),
33-
"$variables": (virtual_datasets.variables, True),
34-
"$derived": (virtual_datasets.derived, False),
35-
"$no_table": (virtual_datasets.no_table, False),
36-
"$statistics": (virtual_datasets.statistics, True),
37-
"$stop_words": (virtual_datasets.stop_words, True),
38-
"$user": (virtual_datasets.user, True),
29+
"$astronauts": ("opteryx.virtual_datasets.astronaut_data", True),
30+
"$planets": ("opteryx.virtual_datasets.planet_data", True),
31+
"$missions": ("opteryx.virtual_datasets.missions", True),
32+
"$satellites": ("opteryx.virtual_datasets.satellite_data", True),
33+
"$variables": ("opteryx.virtual_datasets.variables_data", True),
34+
"$derived": ("opteryx.virtual_datasets.derived_data", False),
35+
"$no_table": ("opteryx.virtual_datasets.no_table_data", False),
36+
"$statistics": ("opteryx.virtual_datasets.statistics", True),
37+
"$stop_words": ("opteryx.virtual_datasets.stop_words", True),
38+
"$user": ("opteryx.virtual_datasets.user", True),
3939
}
4040

4141

42+
def _load_provider(name: str) -> Tuple[object, bool]:
43+
"""Lazily import and return the virtual dataset provider module and suggestable flag.
44+
45+
Returns (module, suggestable)
46+
"""
47+
entry = WELL_KNOWN_DATASETS.get(name)
48+
if entry is None:
49+
return None, False
50+
module_path, suggestable = entry
51+
module = importlib.import_module(module_path)
52+
return module, suggestable
53+
54+
4255
def suggest(dataset):
4356
"""
4457
Provide suggestions to the user if they gave a table that doesn't exist.
@@ -81,7 +94,7 @@ def get_dataset_schema(self) -> RelationSchema:
8194
if self.dataset not in WELL_KNOWN_DATASETS:
8295
suggestion = suggest(self.dataset)
8396
raise DatasetNotFoundError(suggestion=suggestion, dataset=self.dataset)
84-
data_provider, _ = WELL_KNOWN_DATASETS.get(self.dataset)
97+
data_provider, _ = _load_provider(self.dataset)
8598
self.relation_statistics = data_provider.statistics()
8699
return data_provider.schema()
87100

@@ -107,20 +120,22 @@ def __init__(
107120
self.date = date
108121
self.variables = variables
109122

110-
def __next__(self) -> pyarrow.Table:
123+
def __next__(self) -> "pyarrow.Table":
111124
"""
112125
Read the next chunk or morsel from the dataset.
113126
114127
Returns:
115128
A pyarrow Table representing a chunk or morsel of the dataset.
116129
raises StopIteration if the dataset is exhausted.
117130
"""
131+
import pyarrow
132+
118133
if self.exhausted:
119134
raise StopIteration("Dataset has been read.")
120135

121136
self.exhausted = True
122137

123-
data_provider, _ = WELL_KNOWN_DATASETS.get(self.dataset_name)
138+
data_provider, _ = _load_provider(self.dataset_name)
124139
if data_provider is None:
125140
suggestion = suggest(self.dataset_name.lower())
126141
raise DatasetNotFoundError(suggestion=suggestion, dataset=self.dataset_name)

0 commit comments

Comments
 (0)