     SimpleTextSplitter,
 )
 
-# Multi-token deterministic character (2 tokens in current model) used to create
-# token pressure without excessive string length. Guard at import time so failures
-# surface clearly if tokenizer behavior changes.
-TWO_TOKEN_CHAR = "Ѐ"
+# Deterministic single-token character used to create token pressure by repetition
+# while keeping tests readable (1 char = 1 token).
+SINGLE_TOKEN_CHAR = "¢"  # 1 token under cl100k_base
 _bpe_for_guard = tiktoken.encoding_for_model(ENCODING_MODEL)
-assert len(_bpe_for_guard.encode(TWO_TOKEN_CHAR)) == 2, (
-    f"Invariant changed: {TWO_TOKEN_CHAR!r} no longer encodes to 2 tokens under {ENCODING_MODEL}; "
-    "adjust tests to a different stable multi-token char."
+assert len(_bpe_for_guard.encode(SINGLE_TOKEN_CHAR)) == 1, (
+    f"Invariant changed: {SINGLE_TOKEN_CHAR!r} no longer encodes to 1 token under {ENCODING_MODEL}; "
+    "adjust tests to a different stable single-token char."
 )
 
 
@@ -251,8 +250,8 @@ def test_unbalanced_figure_treated_as_text():
 def test_oversize_single_sentence_recursion():
     """A single oversized sentence (no punctuation) should be recursively split by token logic."""
     splitter = SentenceTextSplitter(max_tokens_per_section=50)
-    # Use TWO_TOKEN_CHAR to exceed token limit with fewer characters: 120 chars -> 240 tokens > 50
-    long_run = TWO_TOKEN_CHAR * 120
+    # Use SINGLE_TOKEN_CHAR repetition to exceed token limit: 120 chars -> 120 tokens > 50
+    long_run = SINGLE_TOKEN_CHAR * 120
     page = Page(page_num=0, offset=0, text=long_run + ".")
     chunks = list(splitter.split_pages([page]))
     assert len(chunks) > 1, "Expected recursive splitting for oversized sentence"
@@ -366,16 +365,16 @@ def test_cross_page_merge_fragment_shift_hard_trim():
     """Exercise hard trim branch where fragment must be aggressively shortened (token loop)."""
     splitter = SentenceTextSplitter(max_tokens_per_section=40)
     splitter.max_section_length = 100
-    # Use multi-token char run to create a large fragment (260 * 2 = 520 tokens) with minimal punctuation.
+    # Use repeated single-token char run to create a large fragment (260 tokens) with minimal punctuation.
     fragment_run_len = 260
-    fragment_run = TWO_TOKEN_CHAR * fragment_run_len
+    fragment_run = SINGLE_TOKEN_CHAR * fragment_run_len
     prev_text = "Start. " + fragment_run
     page1 = Page(page_num=0, offset=0, text=prev_text)
     # Next page small continuation
     page2 = Page(page_num=1, offset=0, text="continuation")
     chunks = list(splitter.split_pages([page1, page2]))
     # Ensure that some fragment run has been moved but also trimmed (shorter than original)
-    moved_runs = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    moved_runs = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if moved_runs:
         assert all(len(run) < fragment_run_len for run in moved_runs), "Expected hard trim to shorten fragment"
     # Ensure we still have a chunk starting with 'Start.' retained portion
@@ -443,43 +442,37 @@ def test_fragment_shift_token_limit_fits_false():
     # Configure large char allowance so only token constraint matters.
     splitter = SentenceTextSplitter(max_tokens_per_section=50)
     splitter.max_section_length = 5000  # very high to avoid char-based fits() failure
-    # Build fragment via multi-token char repetition (120 chars -> 240 tokens) beyond the 50-token limit.
-    fragment = TWO_TOKEN_CHAR * 120  # no terminating punctuation
+    # Build fragment via single-token char repetition (120 tokens) beyond the 50-token limit.
+    fragment = SINGLE_TOKEN_CHAR * 120  # no terminating punctuation
     prev_text = "Intro sentence." + fragment  # last sentence end ensures fragment_start > 0
     page1 = Page(page_num=0, offset=0, text=prev_text)
     # Next page starts lowercase to trigger merge attempt; small first_new keeps emphasis on fragment tokens.
     page2 = Page(page_num=1, offset=0, text="cont tail")
     chunks = list(splitter.split_pages([page1, page2]))
     # Retained intro sentence should appear.
     assert any(c.text.startswith("Intro sentence.") for c in chunks)
-    # A moved fragment portion beginning with TWO_TOKEN_CHAR should appear but be trimmed
-    moved = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    # A moved fragment portion beginning with SINGLE_TOKEN_CHAR should appear but be trimmed
+    moved = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if moved:
         assert all(len(m) < len(fragment) for m in moved), "Expected trimmed fragment shorter than original"
 
 
-def test_fragment_shift_token_limit_multi_token_char():
-    """Deterministically force token overflow using a multi-token Unicode char (Ѐ = 2 tokens per char).
-    Repeats of 'Ѐ' rapidly exceed the token limit while keeping char length well below char threshold,
-    exercising the fits() token-limit branch and trimming loop without monkeypatching the encoder.
+def test_fragment_shift_token_limit_single_token_char():
+    """Deterministically force token overflow using the single-token pressure char.
+    Repeats exceed the token limit while keeping 1:1 char/token mapping for simpler assertions.
     """
     splitter = SentenceTextSplitter(max_tokens_per_section=80)
     splitter.max_section_length = 5000  # ensure only token constraint matters
-    multi_token_char = "Ѐ"  # currently 2 tokens in text-embedding-ada-002
-    # Guardrail: fail fast if tokenizer changes and Ѐ stops being 2 tokens (adjust test strategy then)
+    pressure_char = SINGLE_TOKEN_CHAR  # now single-token char
    bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
-    assert len(bpe.encode(multi_token_char)) == 2, (
-        f"Invariant changed: 'Ѐ' is {len(bpe.encode(multi_token_char))} tokens for model {ENCODING_MODEL}; "
-        "update test strategy (choose a different stable multi-token char)."
-    )
-    fragment = multi_token_char * 400  # ~800 tokens > 80 token limit
-    prev_text = "Intro sentence." + fragment  # ensures fragment_start > 0 (there is a prior sentence end)
+    assert len(bpe.encode(pressure_char)) == 1
+    fragment = pressure_char * 400  # 400 tokens > 80 token limit
+    prev_text = "Intro sentence." + fragment  # ensures fragment_start > 0 (prior sentence end)
     page1 = Page(page_num=0, offset=0, text=prev_text)
     page2 = Page(page_num=1, offset=0, text="cont tail")
     chunks = list(splitter.split_pages([page1, page2]))
     assert any(c.text.startswith("Intro sentence.") for c in chunks)
-    moved = [c.text for c in chunks if c.text and c.text[0] == multi_token_char]
-    # Trim loop should reduce moved fragment below original full fragment length
+    moved = [c.text for c in chunks if c.text and c.text[0] == pressure_char]
     if moved:
         assert all(len(m) < len(fragment) for m in moved), "Expected trimmed fragment shorter than original"
 
@@ -538,11 +531,11 @@ def test_cross_page_fragment_hard_trim_iterative():
     splitter = SentenceTextSplitter(max_tokens_per_section=30)
     splitter.max_section_length = 80
-    fragment_iter_len = 210  # 420 tokens
-    prev = "Intro. " + (TWO_TOKEN_CHAR * fragment_iter_len)
+    fragment_iter_len = 210  # 210 tokens (1 token per char)
+    prev = "Intro. " + (SINGLE_TOKEN_CHAR * fragment_iter_len)
     page1 = Page(page_num=0, offset=0, text=prev)
     page2 = Page(page_num=1, offset=0, text="continuation lower start")
     chunks = list(splitter.split_pages([page1, page2]))
     # Some trimmed fragment should appear but much shorter than original
-    trimmed = [c.text for c in chunks if c.text.startswith(TWO_TOKEN_CHAR)]
+    trimmed = [c.text for c in chunks if c.text.startswith(SINGLE_TOKEN_CHAR)]
     if trimmed:
         assert all(len(t) < fragment_iter_len for t in trimmed)
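
For anyone who wants to check the token-count invariant locally before relying on a pressure character, here is a minimal standalone sketch (not part of the diff). It uses only tiktoken calls already present in the tests (encoding_for_model / encode); the hard-coded ENCODING_MODEL value and the candidate list are illustrative assumptions taken from the comments above.

# Sketch: report how many tokens each candidate pressure character occupies.
# Assumption: ENCODING_MODEL is "text-embedding-ada-002" (which maps to cl100k_base),
# mirroring the constant the tests import; adjust if the suite uses a different model.
import tiktoken

ENCODING_MODEL = "text-embedding-ada-002"

def token_len(text: str) -> int:
    # encoding_for_model() and encode() are standard tiktoken APIs; len() gives the token count.
    bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
    return len(bpe.encode(text))

if __name__ == "__main__":
    # The diff claims "¢" encodes to 1 token and the old "Ѐ" to 2; this prints the actual counts.
    for candidate in ("¢", "Ѐ"):
        print(repr(candidate), token_len(candidate))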