Skip to content

Commit bb72982

Browse files
Copilot and sam-hey authored
perf: Optimize file I/O, regex compilation, and logging (#207)
* Initial plan * perf: Optimize file I/O, regex patterns, and hash functions - Use context managers for file operations to ensure proper resource cleanup - Precompile regex patterns for repeated use (whitespace, URL extraction, sentence splitting) - Replace expensive SHA256 hash with native Python hash for PydanticModel - Optimize logging serialization to handle bool type explicitly - Improve DataFrame sorting to avoid work on empty dataframes - Reduce redundant os.path.abspath calls in warnings_to_logger Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * fix: Correct import order for regex pattern definitions Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * style: Apply ruff formatting * fix: Address code review feedback - Fix regex pattern to properly match newlines (not escaped backslash) - Maintain backward compatibility in logging structure (nested extra dict) - Ensure log parsing systems continue to work with existing format Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * fix: Maintain deterministic hash for compatibility - Keep SHA256-based hash for determinism (Python's hash() is randomized) - Optimize by building tuple first instead of multiple string concatenations - This ensures hash values are consistent across Python sessions - Fixes test failure in test_metadata_field_metadata Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * perf: Further optimize hash function per code review - Move hashlib import to module level to avoid repeated import overhead - Join generator directly instead of building intermediate tuple - Maintains deterministic SHA256 hash for backward compatibility - Addresses code review suggestions for performance Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> * revert: Remove hash function optimization from PR Reverted __hash__ method to original implementation as requested. The hash function changes will be addressed in a separate PR. 
File I/O improvements with context managers remain in place. Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: sam-hey <40773225+sam-hey@users.noreply.github.com>
1 parent f29b6ea commit bb72982

File tree

5 files changed

+31
-17
lines changed

5 files changed

+31
-17
lines changed

wurzel/datacontract/datacontract.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def load_from_path(cls, path: Path, *args) -> Self:
5353
import pandas as pd # pylint: disable=import-outside-toplevel
5454

5555
# Load CSV from path
56-
read_data = pd.read_csv(path.open(encoding="utf-8"))
56+
with path.open(encoding="utf-8") as f:
57+
read_data = pd.read_csv(f)
5758

5859
def _literal_eval_or_passthrough(value):
5960
"""Convert stringified literals to Python objects because pandas keeps CSV cells as strings."""
@@ -124,9 +125,11 @@ def load_from_path(cls, path: Path, model_type: type[Union[Self, list[Self]]]) -
124125
model_type = [ty for ty in typing.get_args(model_type) if ty][0]
125126
if get_origin(model_type) is None:
126127
if issubclass(model_type, pydantic.BaseModel):
127-
return cls(**json.load(path.open(encoding="utf-8")))
128+
with path.open(encoding="utf-8") as f:
129+
return cls(**json.load(f))
128130
elif get_origin(model_type) is list:
129-
data = json.load(path.open(encoding="utf-8"))
131+
with path.open(encoding="utf-8") as f:
132+
data = json.load(f)
130133
for i, entry in enumerate(data):
131134
data[i] = cls(**entry)
132135
return data

wurzel/step_executor/base_executor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,21 @@ def _try_sort(x: StepReturnType) -> StepReturnType:
6565
6666
Returns either a sorted x or x itself
6767
"""
68-
_log_extra = {"extra": {"type": type(x)}}
6968
if isinstance(x, PydanticModel):
7069
return x
7170
try:
7271
if isinstance(x, (list, set)):
7372
return sorted(x)
7473
if isinstance(x, pandas.DataFrame):
75-
return x.sort_values(x.columns[0])
74+
# Only sort if DataFrame has columns and is not empty
75+
if not x.empty and len(x.columns) > 0:
76+
return x.sort_values(x.columns[0])
77+
return x
7678
# pylint: disable-next=bare-except
7779
except: # noqa: E722
78-
log.warning("Could not sort output", **_log_extra)
80+
log.warning("Could not sort output", extra={"extra": {"type": type(x).__name__}})
7981
return x
80-
log.warning("Can't sort objects of this type", **_log_extra)
82+
log.warning("Can't sort objects of this type", extra={"extra": {"type": type(x).__name__}})
8183
return x
8284

8385

wurzel/steps/embedding/step.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030

3131
log = getLogger(__name__)
3232

33+
# Precompile regex patterns for performance
34+
_WHITESPACE_PATTERN = re.compile(r"([.,!?]+)?\s+")
35+
_URL_PATTERN = re.compile(r"https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)")
36+
3337

3438
class Embedded(TypedDict):
3539
"""dict definition of a embedded document."""
@@ -164,7 +168,7 @@ def is_stopword(self, word: str) -> bool:
164168
@classmethod
165169
def whitespace_word_tokenizer(cls, text: str) -> list[str]:
166170
"""Simple Regex based whitespace word tokenizer."""
167-
return [x for x in re.split(r"([.,!?]+)?\s+", text) if x]
171+
return [x for x in _WHITESPACE_PATTERN.split(text) if x]
168172

169173
def get_simple_context(self, text):
170174
"""Simple function to create a context from a text."""
@@ -214,11 +218,8 @@ def _replace_link(cls, text: str):
214218
The text with URLs replaced by 'LINK'.
215219
216220
"""
217-
# Extract URL from a string
218-
url_extract_pattern = (
219-
"https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)" # pylint: disable=line-too-long
220-
)
221-
links = sorted(re.findall(url_extract_pattern, text), key=len, reverse=True)
221+
# Use precompiled pattern for better performance
222+
links = sorted(_URL_PATTERN.findall(text), key=len, reverse=True)
222223
for link in links:
223224
text = text.replace(link, "LINK")
224225
return text

wurzel/utils/logging.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,11 @@ def warnings_to_logger(message: str, category: str, filename: str, lineno: str,
4242
4343
"""
4444
# pylint: disable=unused-argument
45+
# Optimize by computing absolute path once
46+
abs_filename = os.path.abspath(filename)
4547
for module_name, module in sys.modules.items():
4648
module_path = getattr(module, "__file__", None)
47-
if module_path and os.path.abspath(module_path) == os.path.abspath(filename):
49+
if module_path and os.path.abspath(module_path) == abs_filename:
4850
break
4951
else:
5052
module_name = os.path.splitext(os.path.split(filename)[1])[0]
@@ -69,9 +71,12 @@ def _make_dict_serializable(item: Any):
6971
key = k if isinstance(k, str) else repr(k)
7072
new_dict[key] = _make_dict_serializable(v)
7173
return new_dict
72-
case str() | int() | float():
74+
case str() | int() | float() | bool():
7375
return item
74-
case list() | set():
76+
case list():
77+
return [_make_dict_serializable(i) for i in item]
78+
case set():
79+
# Convert set to list for JSON serialization
7580
return [_make_dict_serializable(i) for i in item]
7681
case _:
7782
return repr(item)

wurzel/utils/splitters/semantic_splitter.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
from wurzel.utils.to_markdown.html2md import MD_RENDER_LOCK
2222
from wurzel.utils.tokenizers import Tokenizer
2323

24+
# Precompile regex pattern for performance
25+
_SENTENCE_SPLIT_PATTERN = re.compile(r"\.(?=\s|\n)")
26+
2427
LEVEL_MAPPING = {
2528
block_token.Heading: 0, # actually 1-6
2629
block_token.List: 7,
@@ -452,7 +455,7 @@ def _split_by_sentence(self, text: str) -> list[str]:
452455
needed_splits = lenth // token_limit
453456
if not needed_splits:
454457
return [text]
455-
sentences = [(self._get_token_len(sent), f"{sent}. ") for sent in re.split(r"\.(?=\s|\\n)", text) if sent.strip()]
458+
sentences = [(self._get_token_len(sent), f"{sent}. ") for sent in _SENTENCE_SPLIT_PATTERN.split(text) if sent.strip()]
456459
chunks: list[str] = []
457460
chunk = ""
458461
chunk_len = 0

0 commit comments

Comments (0)