Skip to content

Commit 4292c0d

Browse files
committed
Ensured merge_splits() high is not recomputed unnecessarily.
1 parent 569545d commit 4292c0d

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

src/semchunk/semchunk.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -100,13 +100,12 @@ def bisect_left(sorted: list, target: int, low: int, high: int) -> int:
100100

101101

102102
def merge_splits(
103-
splits: list[str], cum_lens: list[int], chunk_size: int, splitter: str, token_counter: Callable, start: int
103+
splits: list[str], cum_lens: list[int], chunk_size: int, splitter: str, token_counter: Callable, start: int, high: int
104104
) -> tuple[int, str]:
105105
"""Merge splits until a chunk size is reached, returning the index of the last split included in the merged chunk along with the merged chunk itself."""
106106

107107
average = 0.2
108108
low = start
109-
high = len(splits) + 1
110109

111110
offset = cum_lens[start]
112111
target = offset + (chunk_size * average)
@@ -183,6 +182,7 @@ def chunk(
183182
cum_lens = list(accumulate(split_lens, initial=0))
184183
split_starts = accumulate([0] + [split_len + splitter_len for split_len in split_lens])
185184
split_starts = [start + _start for start in split_starts]
185+
num_splits_plus_one = len(splits) + 1
186186

187187
chunks = []
188188
skips = set()
@@ -218,6 +218,7 @@ def chunk(
218218
splitter=splitter,
219219
token_counter=token_counter,
220220
start=i,
221+
high=num_splits_plus_one,
221222
)
222223

223224
# Mark any splits included in the new chunk for exclusion from future chunks.

0 commit comments

Comments
 (0)