Skip to content

Commit ef69ab8

Browse files
authored
Split long sentences on commas to prevent skipped words (#143)
## Summary When a single sentence exceeds `max_tokens` (50) and has no sentence-ending punctuation (`.`, `!`, `?`), the text splitting logic now falls back to splitting on commas, semicolons, and colons. This prevents the model from silently skipping parts of long sentences. Fixes #38
1 parent 670e777 commit ef69ab8

File tree

3 files changed

+194
-20
lines changed

3 files changed

+194
-20
lines changed

pocket_tts/conditioners/text.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ def __call__(self, text: str) -> TokenizedText:
3535
return TokenizedText(torch.tensor(self.sp.encode(text, out_type=int))[None, :])
3636

3737

38+
DEFAULT_TOKENIZER_N_BINS = 4000
DEFAULT_TOKENIZER_PATH = (
    "hf://kyutai/pocket-tts-without-voice-cloning/"
    "tokenizer.model@d4fdd22ae8c8e1cb3634e150ebeff1dab2d16df3"
)


def get_default_tokenizer() -> SentencePieceTokenizer:
    """Build the stock SentencePieceTokenizer for pocket-tts.

    Uses the pinned default model path and vocabulary size defined above;
    the model file is downloaded from HuggingFace the first time it is used.
    """
    tokenizer = SentencePieceTokenizer(DEFAULT_TOKENIZER_N_BINS, DEFAULT_TOKENIZER_PATH)
    return tokenizer
51+
52+
3853
class LUTConditioner(BaseConditioner):
3954
"""Lookup table TextConditioner.
4055

pocket_tts/models/tts_model.py

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -838,39 +838,71 @@ def prepare_text_prompt(text: str) -> tuple[str, int]:
838838
return text, frames_after_eos_guess
839839

840840

841+
def _find_boundary_indices(list_of_tokens: list[int], boundary_tokens: list[int]) -> list[int]:
842+
"""Find token indices where text should be split based on boundary tokens.
843+
844+
Returns a list of boundary positions used to slice segments. Each consecutive
845+
pair (indices[i], indices[i+1]) defines one segment. The first element is
846+
always 0 and the last is always len(list_of_tokens).
847+
"""
848+
indices = [0]
849+
previous_was_boundary = False
850+
for idx, token in enumerate(list_of_tokens):
851+
if token in boundary_tokens:
852+
previous_was_boundary = True
853+
else:
854+
if previous_was_boundary:
855+
indices.append(idx)
856+
previous_was_boundary = False
857+
indices.append(len(list_of_tokens))
858+
return indices
859+
860+
861+
def _segments_from_boundaries(
862+
list_of_tokens: list[int], boundary_indices: list[int], tokenizer
863+
) -> list[tuple[int, str]]:
864+
"""Decode token segments between boundary indices into (token_count, text) pairs."""
865+
segments = []
866+
for i in range(len(boundary_indices) - 1):
867+
start = boundary_indices[i]
868+
end = boundary_indices[i + 1]
869+
text = tokenizer.sp.decode(list_of_tokens[start:end])
870+
segments.append((end - start, text))
871+
return segments
872+
873+
841874
def split_into_best_sentences(tokenizer, text_to_generate: str, max_tokens: int) -> list[str]:
842875
text_to_generate, _ = prepare_text_prompt(text_to_generate)
843876
text_to_generate = text_to_generate.strip()
844877
tokens = tokenizer(text_to_generate)
845878
list_of_tokens = tokens.tokens[0].tolist()
846879

847880
_, *end_of_sentence_tokens = tokenizer(".!...?").tokens[0].tolist()
848-
849-
end_of_sentences_indices = [0]
850-
previous_was_end_of_sentence_token = False
851-
852-
for token_idx, token in enumerate(list_of_tokens):
853-
if token in end_of_sentence_tokens:
854-
previous_was_end_of_sentence_token = True
881+
sentence_boundaries = _find_boundary_indices(list_of_tokens, end_of_sentence_tokens)
882+
nb_tokens_and_sentences = _segments_from_boundaries(
883+
list_of_tokens, sentence_boundaries, tokenizer
884+
)
885+
886+
# Sub-split oversized sentences on commas, semicolons, and colons to prevent skipped words
887+
_, *fallback_tokens = tokenizer(",;:").tokens[0].tolist()
888+
refined_segments = []
889+
for nb_tokens, text in nb_tokens_and_sentences:
890+
if nb_tokens <= max_tokens:
891+
refined_segments.append((nb_tokens, text))
855892
else:
856-
if previous_was_end_of_sentence_token:
857-
end_of_sentences_indices.append(token_idx)
858-
previous_was_end_of_sentence_token = False
859-
end_of_sentences_indices.append(len(list_of_tokens))
860-
861-
nb_tokens_and_sentences = []
862-
for i in range(len(end_of_sentences_indices) - 1):
863-
# let's print
864-
start = end_of_sentences_indices[i]
865-
end = end_of_sentences_indices[i + 1]
866-
text = tokenizer.sp.decode(list_of_tokens[start:end])
867-
nb_tokens_and_sentences.append((end - start, text))
893+
sub_tokens = tokenizer(text.strip()).tokens[0].tolist()
894+
sub_boundaries = _find_boundary_indices(sub_tokens, fallback_tokens)
895+
sub_segments = _segments_from_boundaries(sub_tokens, sub_boundaries, tokenizer)
896+
if len(sub_segments) > 1:
897+
refined_segments.extend(sub_segments)
898+
else:
899+
refined_segments.append((nb_tokens, text))
868900

869901
max_nb_tokens_in_a_chunk = max_tokens
870902
chunks = []
871903
current_chunk = ""
872904
current_nb_of_tokens_in_chunk = 0
873-
for nb_tokens, sentence in nb_tokens_and_sentences:
905+
for nb_tokens, sentence in refined_segments:
874906
if current_chunk == "":
875907
current_chunk = sentence
876908
current_nb_of_tokens_in_chunk = nb_tokens
@@ -887,6 +919,16 @@ def split_into_best_sentences(tokenizer, text_to_generate: str, max_tokens: int)
887919
if current_chunk != "":
888920
chunks.append(current_chunk.strip())
889921

922+
for chunk in chunks:
923+
chunk_tokens = tokenizer(chunk.strip()).tokens[0].tolist()
924+
if len(chunk_tokens) > max_tokens:
925+
logger.warning(
926+
"Chunk has %d tokens (max %d), generation may skip words: '%.50s...'",
927+
len(chunk_tokens),
928+
max_tokens,
929+
chunk,
930+
)
931+
890932
return chunks
891933

892934

tests/test_split_sentences.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""Tests for the text splitting logic in split_into_best_sentences."""
2+
3+
import pytest
4+
5+
from pocket_tts.conditioners.text import get_default_tokenizer
6+
from pocket_tts.models.tts_model import split_into_best_sentences
7+
8+
9+
@pytest.fixture(scope="session")
def tokenizer():
    """Session-scoped default tokenizer, built once and shared by all tests."""
    return get_default_tokenizer()
12+
13+
14+
def test_short_text_single_chunk(tokenizer):
    """A brief input well under the budget comes back as exactly one chunk."""
    result = split_into_best_sentences(tokenizer, "Hello world.", 50)
    assert len(result) == 1
18+
19+
20+
def test_multiple_sentences_split(tokenizer):
    """Several sentences that together exceed max_tokens get broken apart."""
    text = "First sentence here. Second sentence here. Third sentence here. Fourth sentence here."
    result = split_into_best_sentences(tokenizer, text, 10)
    assert len(result) > 1
25+
26+
27+
def test_long_sentence_with_commas_is_split(tokenizer):
    """A long period-free sentence must fall back to splitting on commas."""
    # Core regression case from issue #38: the Tale of Two Cities opening.
    text = (
        "It was the best of times, it was the worst of times, "
        "it was the age of wisdom, it was the age of foolishness, "
        "it was the epoch of belief, it was the epoch of incredulity, "
        "it was the season of Light, it was the season of Darkness, "
        "it was the spring of hope, it was the winter of despair"
    )
    chunks = split_into_best_sentences(tokenizer, text, 50)
    assert len(chunks) > 1, "Long comma-separated text should be split into multiple chunks"

    # The fallback split must not drop any words.
    rejoined = " ".join(chunks).lower()
    expected_phrases = [
        "best of times",
        "worst of times",
        "age of foolishness",
        "winter of despair",
    ]
    for phrase in expected_phrases:
        assert phrase in rejoined, f"'{phrase}' should be preserved after splitting"
44+
45+
46+
def test_long_sentence_with_commas_respects_max_tokens(tokenizer):
    """Chunks produced by comma splitting stay close to the token budget."""
    text = (
        "It was the best of times, it was the worst of times, "
        "it was the age of wisdom, it was the age of foolishness, "
        "it was the epoch of belief, it was the epoch of incredulity"
    )
    max_tokens = 20
    for chunk in split_into_best_sentences(tokenizer, text, max_tokens):
        token_count = len(tokenizer(chunk.strip()).tokens[0].tolist())
        # Comma clauses vary in size, so allow a 2x tolerance over the budget.
        assert token_count <= max_tokens * 2, (
            f"Chunk '{chunk[:50]}...' has {token_count} tokens, expected ~{max_tokens}"
        )
61+
62+
63+
def test_mixed_sentences_and_commas(tokenizer):
    """Inputs mixing normal sentences with one long comma-heavy sentence still split."""
    text = (
        "Short sentence. "
        "This is a very long sentence with many clauses, separated by commas, "
        "that goes on and on, and on some more, without any periods at all, "
        "until it finally reaches a period. "
        "Another short one."
    )
    result = split_into_best_sentences(tokenizer, text, 20)
    assert len(result) >= 3
74+
75+
76+
def test_no_commas_no_periods_stays_single_chunk(tokenizer):
    """With no punctuation to split on, the whole text stays as one chunk."""
    text = "one two three four five six seven eight nine ten eleven twelve"
    result = split_into_best_sentences(tokenizer, text, 5)
    # No split points exist, so a single oversized chunk is expected.
    assert len(result) == 1
82+
83+
84+
def test_semicolons_and_colons_also_split(tokenizer):
    """Semicolons (and colons) count as fallback split points, like commas."""
    text = (
        "First clause here; second clause here; third clause here; "
        "fourth clause here; fifth clause here; sixth clause here"
    )
    result = split_into_best_sentences(tokenizer, text, 15)
    assert len(result) > 1
92+
93+
94+
def test_short_sentence_not_affected_by_comma_splitting(tokenizer):
    """A sentence under max_tokens is left whole even though it contains a comma."""
    chunks = split_into_best_sentences(tokenizer, "Hello, world.", 50)
    assert len(chunks) == 1
    lowered = chunks[0].lower()
    assert "hello" in lowered
    assert "world" in lowered
101+
102+
103+
def test_empty_string_raises(tokenizer):
    """An empty prompt is rejected with ValueError (raised by prepare_text_prompt)."""
    with pytest.raises(ValueError, match="empty"):
        split_into_best_sentences(tokenizer, "", 50)
107+
108+
109+
def test_oversized_clause_without_commas_still_returns(tokenizer):
    """An unsplittable oversized clause is returned whole rather than dropped."""
    # Twenty punctuation-free words: there is nothing to split on.
    words = [f"word{i}" for i in range(20)]
    text = " ".join(words)
    chunks = split_into_best_sentences(tokenizer, text, 5)
    assert len(chunks) == 1
    # prepare_text_prompt capitalizes the first char and adds a trailing period,
    # so normalize case and strip the trailing dot before comparing.
    assert chunks[0].lower().rstrip(".") == text.lower()

0 commit comments

Comments
 (0)