Skip to content

Commit 52d2baf

Browse files
committed
Micro-optimized left bisect.
1 parent d70d7b9 commit 52d2baf

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

src/semchunk/semchunk.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import math
55
import inspect
66

7-
from bisect import bisect_left
87
from typing import Callable, Sequence, TYPE_CHECKING
98
from functools import cache
109
from itertools import accumulate
@@ -62,6 +61,20 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
6261
# Return the splitter and the split text.
6362
return splitter, splitter_is_whitespace, text.split(splitter)
6463

64+
def bisect_left(a: Sequence[float], x: float, hi: "int | None" = None) -> int:
    """Return the leftmost index in `a[:hi]` at which `x` could be inserted while keeping it sorted.

    This is a pure-Python binary search over the half-open range `[0, hi)`.

    Args:
        a: The sequence to search. `a[:hi]` must already be sorted in ascending order.
        x: The value to locate. Any value comparable with the items of `a` works, so both integers and floats are accepted.
        hi: The exclusive upper bound of the search range. Defaults to `len(a)` when omitted.

    Returns:
        The smallest index `i` in `[0, hi]` such that every item of `a[:i]` is strictly less than `x`."""

    lo = 0

    # Search the whole sequence when the caller does not supply an upper bound.
    if hi is None:
        hi = len(a)

    while lo < hi:
        mid = (lo + hi) // 2

        if a[mid] < x:
            # Everything up to and including `mid` is too small; discard the left half.
            lo = mid + 1

        else:
            # `a[mid]` is at or past `x`, so `mid` remains a candidate insertion point.
            hi = mid

    return lo
77+
6578
def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counter: Callable) -> tuple[int, str]:
6679
"""Merge splits until a chunk size is reached, returning the index of the last split included in the merged chunk along with the merged chunk itself."""
6780

@@ -72,7 +85,7 @@ def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counte
7285
cumulative_lengths.append(cumulative_lengths[-1])
7386

7487
while low < high:
75-
i = bisect_left(cumulative_lengths[low : high + 1], chunk_size * average)
88+
i = bisect_left(cumulative_lengths[low : high + 1], chunk_size * average, hi = (high - low) + 1)
7689
midpoint = min(i + low, high - 1)
7790

7891
tokens = token_counter(splitter.join(splits[:midpoint]))

0 commit comments

Comments
 (0)