Skip to content

Commit 7fd64eb

Browse files
committed
Fix Divide by zero error #4
1 parent b010c04 commit 7fd64eb

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

src/semchunk/semchunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counte
7373

7474
tokens = token_counter(splitter.join(splits[:midpoint]))
7575

76-
average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] else average
76+
average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] and tokens > 0 else average
7777

7878
if tokens > chunk_size:
7979
high = midpoint
@@ -256,4 +256,4 @@ def chunker(text_or_texts: str | Sequence[str]) -> list[str] | list[list[str]]:
256256

257257
return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts]
258258

259-
return chunker
259+
return chunker

tests/test_semchunk.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,15 @@ def tiktoken_token_counter(text: str) -> int:
7979
except ValueError:
8080
worked = True
8181

82-
assert worked
82+
assert worked
83+
84+
85+
def test_merge_splits_zero_tokens() -> None:
    """Regression test: `merge_splits` must not divide by zero.

    The token counter used here always returns 0, which mimics the case
    described in the original report where the midpoint is 1 and the first
    split is whitespace, so the counted token total for the candidate merge
    is 0. The call must complete without raising `ZeroDivisionError` and
    return the expected `(3, ' text here more text')` result.
    """
    try:
        result = semchunk.semchunk.merge_splits([' ', 'text here', 'more text'], 20, ' ', lambda s: 0)
    except ZeroDivisionError as exc:
        # Raise explicitly rather than `assert False`: asserts are stripped
        # under `python -O`, which would let execution fall through to an
        # unbound `result`. The f-string makes the message actually include
        # the caught exception instead of the literal text "{exc}".
        raise AssertionError(f"merge_splits raised exception {exc}") from exc
    assert result == (3, ' text here more text')

0 commit comments

Comments
 (0)