Fix Divide by zero error #4

jcobol · jcobol · commit 7fd64eb8cf51 · 2024-05-27T00:35:53.000-04:00
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
@@ -73,7 +73,7 @@ def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counte
 
         tokens = token_counter(splitter.join(splits[:midpoint]))
 
-        average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] else average
+        average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] and tokens > 0 else average
 
         if tokens > chunk_size:
             high = midpoint
@@ -256,4 +256,4 @@ def chunker(text_or_texts: str | Sequence[str]) -> list[str] | list[list[str]]:
         
         return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts]
     
-    return chunker
+    return chunker
diff --git a/tests/test_semchunk.py b/tests/test_semchunk.py
@@ -79,4 +79,15 @@ def tiktoken_token_counter(text: str) -> int:
     except ValueError:
         worked = True
     
-    assert worked
+    assert worked
+
+
+def test_merge_splits_zero_tokens() -> None:
+    """Check that merge_splits does not divide by zero when the token counter returns 0.
+    This can happen when the midpoint is 1 and the first split is whitespace.
+    """
+    try:
+        result = semchunk.semchunk.merge_splits(['    ', 'text here', 'more text'], 20, ' ', lambda s: 0)
+    except ZeroDivisionError as exc:
+        assert False, f"merge_splits raised exception {exc}"
+    assert result == (3, '     text here more text')