|
| 1 | +"""Tests for the text splitting logic in split_into_best_sentences.""" |
| 2 | + |
| 3 | +import pytest |
| 4 | + |
| 5 | +from pocket_tts.conditioners.text import get_default_tokenizer |
| 6 | +from pocket_tts.models.tts_model import split_into_best_sentences |
| 7 | + |
| 8 | + |
@pytest.fixture(scope="session")
def tokenizer():
    """Provide the default tokenizer once for the whole test session."""
    tok = get_default_tokenizer()
    return tok
| 12 | + |
| 13 | + |
def test_short_text_single_chunk(tokenizer):
    """A short input should come back as exactly one chunk."""
    result = split_into_best_sentences(tokenizer, "Hello world.", 50)
    assert len(result) == 1
| 18 | + |
| 19 | + |
def test_multiple_sentences_split(tokenizer):
    """Several sentences that together exceed max_tokens must be broken up."""
    sentences = [
        "First sentence here.",
        "Second sentence here.",
        "Third sentence here.",
        "Fourth sentence here.",
    ]
    result = split_into_best_sentences(tokenizer, " ".join(sentences), 10)
    assert len(result) > 1
| 25 | + |
| 26 | + |
def test_long_sentence_with_commas_is_split(tokenizer):
    """A long sentence with only commas (no periods) must be split on commas."""
    # Core regression for issue #38 — the Tale of Two Cities opening line.
    text = (
        "It was the best of times, it was the worst of times, "
        "it was the age of wisdom, it was the age of foolishness, "
        "it was the epoch of belief, it was the epoch of incredulity, "
        "it was the season of Light, it was the season of Darkness, "
        "it was the spring of hope, it was the winter of despair"
    )
    chunks = split_into_best_sentences(tokenizer, text, 50)
    assert len(chunks) > 1, "Long comma-separated text should be split into multiple chunks"

    # No words may be lost while splitting: every key phrase must survive.
    recombined = " ".join(chunks).lower()
    expected_phrases = (
        "best of times",
        "worst of times",
        "age of foolishness",
        "winter of despair",
    )
    for phrase in expected_phrases:
        assert phrase in recombined, f"'{phrase}' should be preserved after splitting"
| 44 | + |
| 45 | + |
def test_long_sentence_with_commas_respects_max_tokens(tokenizer):
    """Chunks produced by comma splitting should stay near max_tokens."""
    text = (
        "It was the best of times, it was the worst of times, "
        "it was the age of wisdom, it was the age of foolishness, "
        "it was the epoch of belief, it was the epoch of incredulity"
    )
    limit = 20
    for piece in split_into_best_sentences(tokenizer, text, limit):
        n_tokens = len(tokenizer(piece.strip()).tokens[0].tolist())
        # Comma clauses vary in size, so allow a generous 2x tolerance.
        assert n_tokens <= limit * 2, (
            f"Chunk '{piece[:50]}...' has {n_tokens} tokens, expected ~{limit}"
        )
| 61 | + |
| 62 | + |
def test_mixed_sentences_and_commas(tokenizer):
    """Both sentence boundaries and long comma clauses should yield splits."""
    text = (
        "Short sentence. "
        "This is a very long sentence with many clauses, separated by commas, "
        "that goes on and on, and on some more, without any periods at all, "
        "until it finally reaches a period. "
        "Another short one."
    )
    pieces = split_into_best_sentences(tokenizer, text, 20)
    assert len(pieces) >= 3
| 74 | + |
| 75 | + |
def test_no_commas_no_periods_stays_single_chunk(tokenizer):
    """With no punctuation to split on, the text remains a single chunk."""
    words = "one two three four five six seven eight nine ten eleven twelve"
    result = split_into_best_sentences(tokenizer, words, 5)
    # There are no split points, so everything must stay together.
    assert len(result) == 1
| 82 | + |
| 83 | + |
def test_semicolons_and_colons_also_split(tokenizer):
    """Semicolons and colons serve as fallback split points too."""
    clauses = (
        "First clause here; second clause here; third clause here; "
        "fourth clause here; fifth clause here; sixth clause here"
    )
    assert len(split_into_best_sentences(tokenizer, clauses, 15)) > 1
| 92 | + |
| 93 | + |
def test_short_sentence_not_affected_by_comma_splitting(tokenizer):
    """Comma logic must not touch a sentence already under max_tokens."""
    result = split_into_best_sentences(tokenizer, "Hello, world.", 50)
    assert len(result) == 1
    lowered = result[0].lower()
    assert "hello" in lowered
    assert "world" in lowered
| 101 | + |
| 102 | + |
def test_empty_string_raises(tokenizer):
    """An empty prompt is rejected with ValueError (from prepare_text_prompt)."""
    empty_prompt = ""
    with pytest.raises(ValueError, match="empty"):
        split_into_best_sentences(tokenizer, empty_prompt, 50)
| 107 | + |
| 108 | + |
def test_oversized_clause_without_commas_still_returns(tokenizer):
    """An oversized clause with no split points must still be returned, not dropped."""
    # Twenty punctuation-free words: there is nowhere to split.
    text = " ".join(f"word{i}" for i in range(20))
    chunks = split_into_best_sentences(tokenizer, text, 5)
    assert len(chunks) == 1
    # prepare_text_prompt capitalizes the first character and appends a
    # trailing period, so normalize both sides before comparing.
    assert chunks[0].lower().rstrip(".") == text.lower()