Skip to content

Commit bb72982

Browse files
Copilot and sam-hey authored
perf: Optimize file I/O, regex compilation, and logging (#207)
* Initial plan * perf: Optimize file I/O, regex patterns, and hash functions - Use context managers for file operations to ensure proper resource cleanup - Precompile regex patterns for repeated use (whitespace, URL extraction, sentence splitting) - Replace expensive SHA256 hash with native Python hash for PydanticModel - Optimize logging serialization to handle bool type explicitly - Improve DataFrame sorting to avoid work on empty dataframes - Reduce redundant os.path.abspath calls in warnings_to_logger Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * fix: Correct import order for regex pattern definitions Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * style: Apply ruff formatting * fix: Address code review feedback - Fix regex pattern to properly match newlines (not escaped backslash) - Maintain backward compatibility in logging structure (nested extra dict) - Ensure log parsing systems continue to work with existing format Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * fix: Maintain deterministic hash for compatibility - Keep SHA256-based hash for determinism (Python's hash() is randomized) - Optimize by building tuple first instead of multiple string concatenations - This ensures hash values are consistent across Python sessions - Fixes test failure in test_metadata_field_metadata Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * perf: Further optimize hash function per code review - Move hashlib import to module level to avoid repeated import overhead - Join generator directly instead of building intermediate tuple - Maintains deterministic SHA256 hash for backward compatibility - Addresses code review suggestions for performance Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * revert: Remove hash function optimization from PR Reverted __hash__ method to original implementation as requested. The hash function changes will be addressed in a separate PR. 
File I/O improvements with context managers remain in place. Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com>
1 parent f29b6ea commit bb72982

File tree

5 files changed

+31
-17
lines changed

5 files changed

+31
-17
lines changed

wurzel/datacontract/datacontract.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def load_from_path(cls, path: Path, *args) -> Self:
5353
import pandas as pd # pylint: disable=import-outside-toplevel
5454

5555
# Load CSV from path
56-
read_data = pd.read_csv(path.open(encoding="utf-8"))
56+
with path.open(encoding="utf-8") as f:
57+
read_data = pd.read_csv(f)
5758

5859
def _literal_eval_or_passthrough(value):
5960
"""Convert stringified literals to Python objects because pandas keeps CSV cells as strings."""
@@ -124,9 +125,11 @@ def load_from_path(cls, path: Path, model_type: type[Union[Self, list[Self]]]) -
124125
model_type = [ty for ty in typing.get_args(model_type) if ty][0]
125126
if get_origin(model_type) is None:
126127
if issubclass(model_type, pydantic.BaseModel):
127-
return cls(**json.load(path.open(encoding="utf-8")))
128+
with path.open(encoding="utf-8") as f:
129+
return cls(**json.load(f))
128130
elif get_origin(model_type) is list:
129-
data = json.load(path.open(encoding="utf-8"))
131+
with path.open(encoding="utf-8") as f:
132+
data = json.load(f)
130133
for i, entry in enumerate(data):
131134
data[i] = cls(**entry)
132135
return data

wurzel/step_executor/base_executor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,21 @@ def _try_sort(x: StepReturnType) -> StepReturnType:
6565
6666
Returns either a sorted x or x itself
6767
"""
68-
_log_extra = {"extra": {"type": type(x)}}
6968
if isinstance(x, PydanticModel):
7069
return x
7170
try:
7271
if isinstance(x, (list, set)):
7372
return sorted(x)
7473
if isinstance(x, pandas.DataFrame):
75-
return x.sort_values(x.columns[0])
74+
# Only sort if DataFrame has columns and is not empty
75+
if not x.empty and len(x.columns) > 0:
76+
return x.sort_values(x.columns[0])
77+
return x
7678
# pylint: disable-next=bare-except
7779
except: # noqa: E722
78-
log.warning("Could not sort output", **_log_extra)
80+
log.warning("Could not sort output", extra={"extra": {"type": type(x).__name__}})
7981
return x
80-
log.warning("Can't sort objects of this type", **_log_extra)
82+
log.warning("Can't sort objects of this type", extra={"extra": {"type": type(x).__name__}})
8183
return x
8284

8385

wurzel/steps/embedding/step.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030

3131
log = getLogger(__name__)
3232

33+
# Precompile regex patterns for performance
34+
_WHITESPACE_PATTERN = re.compile(r"([.,!?]+)?\s+")
35+
_URL_PATTERN = re.compile(r"https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)")
36+
3337

3438
class Embedded(TypedDict):
3539
"""dict definition of a embedded document."""
@@ -164,7 +168,7 @@ def is_stopword(self, word: str) -> bool:
164168
@classmethod
165169
def whitespace_word_tokenizer(cls, text: str) -> list[str]:
166170
"""Simple Regex based whitespace word tokenizer."""
167-
return [x for x in re.split(r"([.,!?]+)?\s+", text) if x]
171+
return [x for x in _WHITESPACE_PATTERN.split(text) if x]
168172

169173
def get_simple_context(self, text):
170174
"""Simple function to create a context from a text."""
@@ -214,11 +218,8 @@ def _replace_link(cls, text: str):
214218
The text with URLs replaced by 'LINK'.
215219
216220
"""
217-
# Extract URL from a string
218-
url_extract_pattern = (
219-
"https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)" # pylint: disable=line-too-long
220-
)
221-
links = sorted(re.findall(url_extract_pattern, text), key=len, reverse=True)
221+
# Use precompiled pattern for better performance
222+
links = sorted(_URL_PATTERN.findall(text), key=len, reverse=True)
222223
for link in links:
223224
text = text.replace(link, "LINK")
224225
return text

wurzel/utils/logging.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,11 @@ def warnings_to_logger(message: str, category: str, filename: str, lineno: str,
4242
4343
"""
4444
# pylint: disable=unused-argument
45+
# Optimize by computing absolute path once
46+
abs_filename = os.path.abspath(filename)
4547
for module_name, module in sys.modules.items():
4648
module_path = getattr(module, "__file__", None)
47-
if module_path and os.path.abspath(module_path) == os.path.abspath(filename):
49+
if module_path and os.path.abspath(module_path) == abs_filename:
4850
break
4951
else:
5052
module_name = os.path.splitext(os.path.split(filename)[1])[0]
@@ -69,9 +71,12 @@ def _make_dict_serializable(item: Any):
6971
key = k if isinstance(k, str) else repr(k)
7072
new_dict[key] = _make_dict_serializable(v)
7173
return new_dict
72-
case str() | int() | float():
74+
case str() | int() | float() | bool():
7375
return item
74-
case list() | set():
76+
case list():
77+
return [_make_dict_serializable(i) for i in item]
78+
case set():
79+
# Convert set to list for JSON serialization
7580
return [_make_dict_serializable(i) for i in item]
7681
case _:
7782
return repr(item)

wurzel/utils/splitters/semantic_splitter.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
from wurzel.utils.to_markdown.html2md import MD_RENDER_LOCK
2222
from wurzel.utils.tokenizers import Tokenizer
2323

24+
# Precompile regex pattern for performance
25+
_SENTENCE_SPLIT_PATTERN = re.compile(r"\.(?=\s|\n)")
26+
2427
LEVEL_MAPPING = {
2528
block_token.Heading: 0, # actually 1-6
2629
block_token.List: 7,
@@ -452,7 +455,7 @@ def _split_by_sentence(self, text: str) -> list[str]:
452455
needed_splits = lenth // token_limit
453456
if not needed_splits:
454457
return [text]
455-
sentences = [(self._get_token_len(sent), f"{sent}. ") for sent in re.split(r"\.(?=\s|\\n)", text) if sent.strip()]
458+
sentences = [(self._get_token_len(sent), f"{sent}. ") for sent in _SENTENCE_SPLIT_PATTERN.split(text) if sent.strip()]
456459
chunks: list[str] = []
457460
chunk = ""
458461
chunk_len = 0

0 commit comments

Comments (0)