mabel-dev
diff --git a/‎opteryx/__init__.py‎
Lines changed: 13 additions & 9 deletions b/‎opteryx/__init__.py‎
Lines changed: 13 additions & 9 deletions
diff --git a/‎opteryx/__main__.py‎
Lines changed: 1 addition & 5 deletions b/‎opteryx/__main__.py‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions b/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎opteryx/compiled/list_ops/list_in_string.pyx‎
Lines changed: 31 additions & 19 deletions b/‎opteryx/compiled/list_ops/list_in_string.pyx‎
Lines changed: 31 additions & 19 deletions
diff --git a/‎opteryx/config.py‎
Lines changed: 68 additions & 11 deletions b/‎opteryx/config.py‎
Lines changed: 68 additions & 11 deletions
diff --git a/‎opteryx/connectors/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎opteryx/connectors/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎opteryx/connectors/virtual_data.py‎
Lines changed: 30 additions & 15 deletions b/‎opteryx/connectors/virtual_data.py‎
Lines changed: 30 additions & 15 deletions
@@ -25,9 +25,10 @@
 from pathlib import Path
 
 from decimal import getcontext
-from typing import Optional, Union, Dict, Any, List
+from typing import Optional, Union, Dict, Any, List, TYPE_CHECKING
 
-import pyarrow
+if TYPE_CHECKING:  # pragma: no cover - only for type checkers
+    import pyarrow
 
 # Set Decimal precision to 28 globally
 getcontext().prec = 28
@@ -51,16 +52,19 @@ def is_mac() -> bool:  # pragma: no cover
 
 
 # python-dotenv allows us to create an environment file to store secrets.
-try:
-    import dotenv  # type:ignore
+# Only try to import dotenv if a .env file exists to avoid paying the
+# import cost when no environment file is present.
+_env_path = Path.cwd() / ".env"
+if _env_path.exists():
+    try:
+        import dotenv  # type:ignore
 
-    _env_path = Path.cwd() / ".env"
-    if _env_path.exists() and dotenv is not None:
         dotenv.load_dotenv(dotenv_path=_env_path)
         if OPTERYX_DEBUG:
             print(f"{datetime.datetime.now()} [LOADER] Loading `.env` file.")
-except ImportError:  # pragma: no cover
-    pass
+    except ImportError:  # pragma: no cover
+        # dotenv is optional; if it's not installed, just continue.
+        pass
 
 
 if OPTERYX_DEBUG:  # pragma: no cover
@@ -179,7 +183,7 @@ def query_to_arrow(
     visibility_filters: Optional[Dict[str, Any]] = None,
     limit: int = None,
     **kwargs,
-) -> pyarrow.Table:
+ ) -> "pyarrow.Table":
     """
     Helper function to execute a query and return a pyarrow Table.
 
 
@@ -12,7 +12,6 @@
 
 import argparse
 import os
-import readline
 import sys
 import threading
 import time
@@ -25,10 +24,6 @@
     from opteryx.utils.sql import clean_statement
     from opteryx.utils.sql import remove_comments
 
-
-if readline:
-    pass
-
 # Define ANSI color codes
 ANSI_RED = "\u001b[31m"
 ANSI_RESET = "\u001b[0m"
@@ -209,6 +204,7 @@ def main():
 
 
 if __name__ == "__main__":
+    import readline  # pragma: no cover
     try:
         main()
     except Exception as e:
 
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1685
+__build__ = 1688
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1685"
+__version__ = "0.26.0-beta.1688"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 
@@ -33,7 +33,7 @@ numpy.import_array()
 
 from cpython.bytes cimport PyBytes_AsString
 from libc.stdint cimport int32_t, uint8_t, uintptr_t
-from libc.string cimport memchr
+from libc.string cimport memchr, memcpy
 import platform
 
 
@@ -65,12 +65,18 @@ init_searcher()
 
 
 cdef inline int fast_memcmp_short(const char *a, const char *b, size_t n):
-    """Optimized memcmp for short strings (<= 8 bytes)"""
-    cdef size_t i
-    for i in range(n):
-        if a[i] != b[i]:
-            return 1
-    return 0
+    """Return 0 if the first n bytes of a and b are equal, otherwise 1."""
+    # Built-in C type: the libc.stdint cimport shown above does not list uint64_t.
+    cdef unsigned long long aval = 0, bval = 0
+    if n == 0:
+        return 0
+    elif n <= 8:
+        # Both words start zeroed and only n bytes are copied in, so no mask is
+        # needed; a (1 << (8 * n)) - 1 mask shifts by 64 bits (C UB) when n == 8.
+        memcpy(&aval, a, n)
+        memcpy(&bval, b, n)
+        return aval != bval
+    return memcmp(a, b, n) != 0
 
 
 cdef inline int boyer_moore_horspool(const char *haystack, size_t haystacklen,
@@ -146,7 +152,9 @@ cdef inline int boyer_moore_horspool_with_table(const char *haystack, size_t hay
     """BMH with precomputed table - optimized version"""
     cdef size_t i = 0
     cdef size_t tail_index
+    cdef size_t needlelen_sub1 = needlelen - 1
     cdef unsigned char last_char
+    cdef unsigned char tail_char
 
     if needlelen == 0 or haystacklen < needlelen:
         return 0
@@ -155,20 +163,24 @@ cdef inline int boyer_moore_horspool_with_table(const char *haystack, size_t hay
     if needlelen == 1:
         return memchr(haystack, needle[0], haystacklen) != NULL
 
-    last_char = <unsigned char>needle[needlelen - 1]
+    last_char = <unsigned char>needle[needlelen_sub1]
+    cdef size_t end_index = haystacklen - needlelen
 
-    while i <= haystacklen - needlelen:
-        # Check last character first
-        if haystack[i + needlelen - 1] == last_char:
-            if needlelen <= 8:
-                if fast_memcmp_short(&haystack[i], needle, needlelen) == 0:
-                    return 1
-            else:
-                if memcmp(&haystack[i], needle, needlelen) == 0:
-                    return 1
+    while i <= end_index:
+        tail_index = i + needlelen_sub1
+        tail_char = <unsigned char>haystack[tail_index]
 
-        tail_index = i + needlelen - 1
-        i += skip[<unsigned char>haystack[tail_index]]
+        # Check last character first
+        if tail_char == last_char:
+            if haystack[i] == needle[0]:
+                if needlelen <= 8:
+                    if fast_memcmp_short(&haystack[i], needle, needlelen) == 0:
+                        return 1
+                else:
+                    if memcmp(&haystack[i], needle, needlelen) == 0:
+                        return 1
+
+        i += skip[tail_char]
 
     return 0
 
 
@@ -10,8 +10,6 @@
 from typing import Optional
 from typing import Union
 
-import psutil
-
 _config_values: dict = {}
 
 # we need a preliminary version of this variable
@@ -30,8 +28,15 @@ def memory_allocation_calculation(allocation: Union[float, int]) -> int:
     Returns:
         int: Memory size in bytes to be allocated.
     """
-    total_memory = psutil.virtual_memory().total  # Convert bytes to megabytes
+    # Import psutil lazily to avoid paying the import cost at module import time.
+    # Use a small helper so tests or callers that need the value will trigger the
+    # import only when this function is called.
+    def _get_total_memory_bytes() -> int:
+        import psutil
+
+        return psutil.virtual_memory().total
 
+    total_memory = _get_total_memory_bytes()
     if 0 < allocation < 1:  # Treat as a percentage
         return int(total_memory * allocation)
     elif allocation >= 1:  # Treat as an absolute value in MB
@@ -44,9 +49,13 @@ def system_gigabytes() -> int:
     """
     Get the total system memory in gigabytes.
 
+    This imports psutil lazily to avoid paying the cost at module import time.
+
     Returns:
         int: Total system memory in gigabytes.
     """
+    import psutil
+
     return psutil.virtual_memory().total // (1024 * 1024 * 1024)
 
 
@@ -163,20 +172,67 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]:
 MAX_CONSECUTIVE_CACHE_FAILURES: int = int(get("MAX_CONSECUTIVE_CACHE_FAILURES", 10))
 """Maximum number of consecutive cache failures before disabling cache usage."""
 
-MAX_LOCAL_BUFFER_CAPACITY: int = memory_allocation_calculation(float(get("MAX_LOCAL_BUFFER_CAPACITY", 0.2)))
-"""Local buffer pool size in either bytes or fraction of system memory."""
+# These values are computed lazily via __getattr__ to avoid importing
+# psutil (and making expensive system calls) during module import.
+# Annotate the names so type checkers know about them, but do not assign
+# values here — __getattr__ will compute and cache them on first access.
+MAX_LOCAL_BUFFER_CAPACITY: int
+"""Local buffer pool size in either bytes or fraction of system memory (lazy)."""
 
-MAX_READ_BUFFER_CAPACITY: int = memory_allocation_calculation(float(get("MAX_READ_BUFFER_CAPACITY", 0.1)))
-"""Read buffer pool size in either bytes or fraction of system memory."""
+MAX_READ_BUFFER_CAPACITY: int
+"""Read buffer pool size in either bytes or fraction of system memory (lazy)."""
 
 MAX_STATISTICS_CACHE_ITEMS: int = get("MAX_STATISTICS_CACHE_ITEMS", 10_000)
 """The number of .parquet files we cache the statistics for."""
 
-CONCURRENT_READS: int = int(get("CONCURRENT_READS", max(system_gigabytes(), 2)))
-"""Number of read workers per data source."""
+_LAZY_VALUES: dict = {}
+
+
+# Lazily computed configuration values. We compute certain values on first
+# access because they depend on expensive system calls (psutil) or other
+# runtime properties. Access these as attributes on the module; __getattr__
+# will compute and cache them.
+
+CONCURRENT_WORKERS_DEFAULT = int(get("CONCURRENT_WORKERS", 2))
+
+
+def _compute_MAX_LOCAL_BUFFER_CAPACITY():  # default 0.2 is read as a fraction of total memory
+    return memory_allocation_calculation(float(get("MAX_LOCAL_BUFFER_CAPACITY", 0.2)))
+
+
+def _compute_MAX_READ_BUFFER_CAPACITY():  # default 0.1 is read as a fraction of total memory
+    return memory_allocation_calculation(float(get("MAX_READ_BUFFER_CAPACITY", 0.1)))
+
+
+def _compute_CONCURRENT_READS():
+    """Return the CONCURRENT_READS setting as an int; default is max(system_gigabytes(), 2)."""
+    return int(get("CONCURRENT_READS", max(system_gigabytes(), 2)))
+
 
-CONCURRENT_WORKERS: int = int(get("CONCURRENT_WORKERS", 2))
-"""Number of worker threads created to execute queries."""
+def __getattr__(name: str):
+    """Resolve lazily computed configuration values (PEP 562 module hook).
+
+    Python calls this only when normal module attribute lookup fails. The
+    memory-derived settings are computed on first access and memoised in
+    `_LAZY_VALUES`; `CONCURRENT_WORKERS` maps to its precomputed default.
+
+    Raises:
+        AttributeError: if `name` is not one of the lazy settings.
+    """
+    if name == "CONCURRENT_WORKERS":
+        # simple default, no expensive computation
+        return CONCURRENT_WORKERS_DEFAULT
+    builders = {
+        "MAX_LOCAL_BUFFER_CAPACITY": _compute_MAX_LOCAL_BUFFER_CAPACITY,
+        "MAX_READ_BUFFER_CAPACITY": _compute_MAX_READ_BUFFER_CAPACITY,
+        "CONCURRENT_READS": _compute_CONCURRENT_READS,
+    }
+    if name not in builders:
+        raise AttributeError(name)
+    cached = _LAZY_VALUES.get(name)
+    if cached is None:
+        cached = _LAZY_VALUES[name] = builders[name]()
+    return cached
 
 DATA_CATALOG_PROVIDER: str = get("DATA_CATALOG_PROVIDER")
 """Data Catalog provider."""
@@ -197,6 +253,7 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]:
 # don't output resource (memory) utilization information
 ENABLE_RESOURCE_LOGGING: bool = bool(get("ENABLE_RESOURCE_LOGGING", False))
 # size of morsels to push between steps
+# MORSEL_SIZE remains a plain constant
 MORSEL_SIZE: int = int(get("MORSEL_SIZE", 64 * 1024 * 1024))
 # not GA
 PROFILE_LOCATION:str = get("PROFILE_LOCATION")
 
@@ -87,7 +87,6 @@ def get_dataset_schema(self, dataset):
 
 import os
 
-import pyarrow
 
 # Lazy imports - connectors are only loaded when actually needed
 # This significantly improves module import time from ~500ms to ~130ms
@@ -196,6 +195,7 @@ def register_df(name, frame):
     """register a orso, pandas or Polars dataframe"""
     # Lazy import ArrowConnector
     from opteryx.connectors.arrow_connector import ArrowConnector
+    import pyarrow
 
     # polars (maybe others) - the polars to arrow API is a mess
     if hasattr(frame, "_df"):  # pragma: no cover
 
@@ -14,10 +14,10 @@
 import datetime
 import typing
 
-import pyarrow
 from orso.schema import RelationSchema
 
-from opteryx import virtual_datasets
+import importlib
+from typing import Tuple
 from opteryx.connectors.base.base_connector import BaseConnector
 from opteryx.connectors.base.base_connector import DatasetReader
 from opteryx.connectors.capabilities import Partitionable
@@ -26,19 +26,32 @@
 from opteryx.utils import arrow
 
 WELL_KNOWN_DATASETS = {
-    "$astronauts": (virtual_datasets.astronauts, True),
-    "$planets": (virtual_datasets.planets, True),
-    "$missions": (virtual_datasets.missions, True),
-    "$satellites": (virtual_datasets.satellites, True),
-    "$variables": (virtual_datasets.variables, True),
-    "$derived": (virtual_datasets.derived, False),
-    "$no_table": (virtual_datasets.no_table, False),
-    "$statistics": (virtual_datasets.statistics, True),
-    "$stop_words": (virtual_datasets.stop_words, True),
-    "$user": (virtual_datasets.user, True),
+    "$astronauts": ("opteryx.virtual_datasets.astronaut_data", True),  # value = (module path, suggestable)
+    "$planets": ("opteryx.virtual_datasets.planet_data", True),
+    "$missions": ("opteryx.virtual_datasets.missions", True),  # NOTE(review): no "_data" suffix here or on the last three entries - confirm these module paths
+    "$satellites": ("opteryx.virtual_datasets.satellite_data", True),
+    "$variables": ("opteryx.virtual_datasets.variables_data", True),
+    "$derived": ("opteryx.virtual_datasets.derived_data", False),
+    "$no_table": ("opteryx.virtual_datasets.no_table_data", False),
+    "$statistics": ("opteryx.virtual_datasets.statistics", True),
+    "$stop_words": ("opteryx.virtual_datasets.stop_words", True),
+    "$user": ("opteryx.virtual_datasets.user", True),
 }
 
 
+def _load_provider(name: str) -> Tuple[typing.Optional[object], bool]:
+    """Lazily import the provider module registered for a virtual dataset.
+
+    Returns (module, suggestable), or (None, False) when `name` is unknown.
+    """
+    entry = WELL_KNOWN_DATASETS.get(name)
+    if entry is None:
+        return None, False
+    module_path, suggestable = entry
+    # Import on demand so a provider module is only loaded when it is requested.
+    return importlib.import_module(module_path), suggestable
+
+
 def suggest(dataset):
     """
     Provide suggestions to the user if they gave a table that doesn't exist.
@@ -81,7 +94,7 @@ def get_dataset_schema(self) -> RelationSchema:
         if self.dataset not in WELL_KNOWN_DATASETS:
             suggestion = suggest(self.dataset)
             raise DatasetNotFoundError(suggestion=suggestion, dataset=self.dataset)
-        data_provider, _ = WELL_KNOWN_DATASETS.get(self.dataset)
+        data_provider, _ = _load_provider(self.dataset)
         self.relation_statistics = data_provider.statistics()
         return data_provider.schema()
 
@@ -107,20 +120,22 @@ def __init__(
         self.date = date
         self.variables = variables
 
-    def __next__(self) -> pyarrow.Table:
+    def __next__(self) -> "pyarrow.Table":
         """
         Read the next chunk or morsel from the dataset.
 
         Returns:
             A pyarrow Table representing a chunk or morsel of the dataset.
             raises StopIteration if the dataset is exhausted.
         """
+        import pyarrow
+
         if self.exhausted:
             raise StopIteration("Dataset has been read.")
 
         self.exhausted = True
 
-        data_provider, _ = WELL_KNOWN_DATASETS.get(self.dataset_name)
+        data_provider, _ = _load_provider(self.dataset_name)
         if data_provider is None:
             suggestion = suggest(self.dataset_name.lower())
             raise DatasetNotFoundError(suggestion=suggestion, dataset=self.dataset_name)