
Commit 5a6b748

Use single token char
1 parent e7f368c · commit 5a6b748

File tree: 1 file changed (+25, -32 lines)


tests/test_prepdocslib_textsplitter.py

Lines changed: 25 additions & 32 deletions
@@ -15,14 +15,13 @@
     SimpleTextSplitter,
 )
 
-# Multi-token deterministic character (2 tokens in current model) used to create
-# token pressure without excessive string length. Guard at import time so failures
-# surface clearly if tokenizer behavior changes.
-TWO_TOKEN_CHAR = "Ѐ"
+# Deterministic single-token character used to create token pressure by repetition
+# while keeping tests readable (1 char = 1 token).
+SINGLE_TOKEN_CHAR = "¢"  # 1 token under cl100k_base
 _bpe_for_guard = tiktoken.encoding_for_model(ENCODING_MODEL)
-assert len(_bpe_for_guard.encode(TWO_TOKEN_CHAR)) == 2, (
-    f"Invariant changed: {TWO_TOKEN_CHAR!r} no longer encodes to 2 tokens under {ENCODING_MODEL}; "
-    "adjust tests to a different stable multi-token char."
+assert len(_bpe_for_guard.encode(SINGLE_TOKEN_CHAR)) == 1, (
+    f"Invariant changed: {SINGLE_TOKEN_CHAR!r} no longer encodes to 1 token under {ENCODING_MODEL}; "
+    "adjust tests to a different stable single-token char."
 )
 
 
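
For anyone reproducing the guard outside the test module, here is a minimal standalone sketch; it assumes tiktoken is installed and that ENCODING_MODEL resolves to the cl100k_base encoding, as the comment in the diff suggests. Both token counts below are the invariants asserted by the old and new guards, not new claims:

    import tiktoken

    # Assumption: cl100k_base is the encoding behind ENCODING_MODEL; the test
    # module itself goes through tiktoken.encoding_for_model(...) instead.
    enc = tiktoken.get_encoding("cl100k_base")
    assert len(enc.encode("¢")) == 1  # new single-token pressure char
    assert len(enc.encode("Ѐ")) == 2  # old two-token char this commit retires
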
@@ -251,8 +250,8 @@ def test_unbalanced_figure_treated_as_text():
 def test_oversize_single_sentence_recursion():
     """A single oversized sentence (no punctuation) should be recursively split by token logic."""
     splitter = SentenceTextSplitter(max_tokens_per_section=50)
-    # Use TWO_TOKEN_CHAR to exceed token limit with fewer characters: 120 chars -> 240 tokens > 50
-    long_run = TWO_TOKEN_CHAR * 120
+    # Use SINGLE_TOKEN_CHAR repetition to exceed token limit: 120 chars -> 120 tokens > 50
+    long_run = SINGLE_TOKEN_CHAR * 120
     page = Page(page_num=0, offset=0, text=long_run + ".")
     chunks = list(splitter.split_pages([page]))
     assert len(chunks) > 1, "Expected recursive splitting for oversized sentence"
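
The comment's arithmetic can be sanity-checked in isolation. Note that the 1:1 char-to-token scaling is the invariant these tests assert for "¢", not a general BPE guarantee, since repeated characters can in principle merge into larger tokens:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")  # assumed encoding, per the comment above
    long_run = "¢" * 120
    # Per the test's premise: 120 chars -> 120 tokens, well over the 50-token
    # budget, so the splitter is forced to recurse.
    print(len(enc.encode(long_run)))  # expected: 120
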
@@ -366,16 +365,16 @@ def test_cross_page_merge_fragment_shift_hard_trim():
     """Exercise hard trim branch where fragment must be aggressively shortened (token loop)."""
     splitter = SentenceTextSplitter(max_tokens_per_section=40)
     splitter.max_section_length = 100
-    # Use multi-token char run to create a large fragment (260 * 2 = 520 tokens) with minimal punctuation.
+    # Use repeated single-token char run to create a large fragment (260 tokens) with minimal punctuation.
     fragment_run_len = 260
-    fragment_run = TWO_TOKEN_CHAR * fragment_run_len
+    fragment_run = SINGLE_TOKEN_CHAR * fragment_run_len
     prev_text = "Start. " + fragment_run
     page1 = Page(page_num=0, offset=0, text=prev_text)
     # Next page small continuation
     page2 = Page(page_num=1, offset=0, text="continuation")
     chunks = list(splitter.split_pages([page1, page2]))
     # Ensure that some fragment run has been moved but also trimmed (shorter than original)
-    moved_runs = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    moved_runs = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if moved_runs:
         assert all(len(run) < fragment_run_len for run in moved_runs), "Expected hard trim to shorten fragment"
     # Ensure we still have a chunk starting with 'Start.' retained portion
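
When a scenario like this fails, it helps to see what the splitter actually emitted. A throwaway inspection snippet (only the .text field is used, since that is all this diff shows of the chunk objects):

    # Debugging aid, not part of the test: show each chunk's size and prefix
    # so a moved or trimmed fragment is visible at a glance.
    for i, chunk in enumerate(chunks):
        print(f"{i}: {len(chunk.text):4d} chars | {chunk.text[:40]!r}")
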
@@ -443,43 +442,37 @@ def test_fragment_shift_token_limit_fits_false():
     # Configure large char allowance so only token constraint matters.
     splitter = SentenceTextSplitter(max_tokens_per_section=50)
     splitter.max_section_length = 5000  # very high to avoid char-based fits() failure
-    # Build fragment via multi-token char repetition (120 chars -> 240 tokens) beyond the 50-token limit.
-    fragment = TWO_TOKEN_CHAR * 120  # no terminating punctuation
+    # Build fragment via single-token char repetition (120 tokens) beyond the 50-token limit.
+    fragment = SINGLE_TOKEN_CHAR * 120  # no terminating punctuation
     prev_text = "Intro sentence." + fragment  # last sentence end ensures fragment_start > 0
     page1 = Page(page_num=0, offset=0, text=prev_text)
     # Next page starts lowercase to trigger merge attempt; small first_new keeps emphasis on fragment tokens.
     page2 = Page(page_num=1, offset=0, text="cont tail")
     chunks = list(splitter.split_pages([page1, page2]))
     # Retained intro sentence should appear.
     assert any(c.text.startswith("Intro sentence.") for c in chunks)
-    # A moved fragment portion beginning with TWO_TOKEN_CHAR should appear but be trimmed
-    moved = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    # A moved fragment portion beginning with SINGLE_TOKEN_CHAR should appear but be trimmed
+    moved = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if moved:
         assert all(len(m) < len(fragment) for m in moved), "Expected trimmed fragment shorter than original"
 
 
-def test_fragment_shift_token_limit_multi_token_char():
-    """Deterministically force token overflow using a multi‑token Unicode char (Ѐ = 2 tokens per char).
-    Repeats of 'Ѐ' rapidly exceed the token limit while keeping char length well below char threshold,
-    exercising the fits() token-limit branch and trimming loop without monkeypatching the encoder.
+def test_fragment_shift_token_limit_single_token_char():
+    """Deterministically force token overflow using the single-token pressure char.
+    Repeats exceed the token limit while keeping 1:1 char/token mapping for simpler assertions.
     """
     splitter = SentenceTextSplitter(max_tokens_per_section=80)
     splitter.max_section_length = 5000  # ensure only token constraint matters
-    multi_token_char = "Ѐ"  # currently 2 tokens in text-embedding-ada-002
-    # Guardrail: fail fast if tokenizer changes and Ѐ stops being 2 tokens (adjust test strategy then)
+    pressure_char = SINGLE_TOKEN_CHAR  # now single-token char
     bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
-    assert len(bpe.encode(multi_token_char)) == 2, (
-        f"Invariant changed: 'Ѐ' is {len(bpe.encode(multi_token_char))} tokens for model {ENCODING_MODEL}; "
-        "update test strategy (choose a different stable multi-token char)."
-    )
-    fragment = multi_token_char * 400  # ~800 tokens > 80 token limit
-    prev_text = "Intro sentence." + fragment  # ensures fragment_start > 0 (there is a prior sentence end)
+    assert len(bpe.encode(pressure_char)) == 1
+    fragment = pressure_char * 400  # 400 tokens > 80 token limit
+    prev_text = "Intro sentence." + fragment  # ensures fragment_start > 0 (prior sentence end)
     page1 = Page(page_num=0, offset=0, text=prev_text)
     page2 = Page(page_num=1, offset=0, text="cont tail")
     chunks = list(splitter.split_pages([page1, page2]))
     assert any(c.text.startswith("Intro sentence.") for c in chunks)
-    moved = [c.text for c in chunks if c.text and c.text[0] == multi_token_char]
-    # Trim loop should reduce moved fragment below original full fragment length
+    moved = [c.text for c in chunks if c.text and c.text[0] == pressure_char]
     if moved:
         assert all(len(m) < len(fragment) for m in moved), "Expected trimmed fragment shorter than original"
 
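
The docstrings above refer to the splitter's fits() check, whose internals are not part of this diff. As a purely hypothetical sketch of the constraint being probed (names and shape are illustrative, not the library's actual code):

    # Hypothetical sketch only: a fragment "fits" when it satisfies both caps.
    # The tests set max_section_length very high so only the token cap can
    # fail, which steers execution into the token-limit branch.
    def fits(text: str, max_chars: int, max_tokens: int, enc) -> bool:
        return len(text) <= max_chars and len(enc.encode(text)) <= max_tokens
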
@@ -538,11 +531,11 @@ def test_cross_page_fragment_hard_trim_iterative():
     splitter = SentenceTextSplitter(max_tokens_per_section=30)
     splitter.max_section_length = 80
     fragment_iter_len = 210  # 210 tokens with the single-token char
-    prev = "Intro. " + (TWO_TOKEN_CHAR * fragment_iter_len)
+    prev = "Intro. " + (SINGLE_TOKEN_CHAR * fragment_iter_len)
     page1 = Page(page_num=0, offset=0, text=prev)
     page2 = Page(page_num=1, offset=0, text="continuation lower start")
     chunks = list(splitter.split_pages([page1, page2]))
     # Some trimmed fragment should appear but much shorter than original
-    trimmed = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    trimmed = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if trimmed:
         assert all(len(t) < fragment_iter_len for t in trimmed)
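
The iterative hard trim these assertions describe can be pictured with the hypothetical fits() helper and enc from the sketches above; again, this illustrates the behavior under test, not the splitter's implementation:

    # Illustrative only: shrink the moved fragment until it passes both caps
    # (max_section_length=80 chars, max_tokens_per_section=30 in this test).
    fragment = "¢" * 210
    while fragment and not fits(fragment, max_chars=80, max_tokens=30, enc=enc):
        fragment = fragment[: len(fragment) // 2]  # strictly shrinks, so it terminates
    # The test only requires that any surviving fragment be shorter than the
    # original 210-char run, which this loop guarantees.
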
