Skip to content

Commit a45774a

Browse files
authored
Ensure there are no zero-length sections for batch API (#1423)
* Assert there are no zero-length sections
* Correct the overlap mechanism when the text section doesn't contain any breaks
* Update textsplitter.py
1 parent d896376 commit a45774a

File tree

2 files changed

+5
-2
lines changed

2 files changed

+5
-2
lines changed

scripts/prepdocslib/textsplitter.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -127,8 +127,10 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
127127
else:
128128
# Split page in half and call function again
129129
# Overlap first and second halves by DEFAULT_OVERLAP_PERCENT%
130-
first_half = text[: int(len(text) // (2.0 + (DEFAULT_OVERLAP_PERCENT / 100)))]
131-
second_half = text[int(len(text) // (1.0 - (DEFAULT_OVERLAP_PERCENT / 100))) :]
130+
middle = int(len(text) // 2)
131+
overlap = int(len(text) * (DEFAULT_OVERLAP_PERCENT / 100))
132+
first_half = text[: middle + overlap]
133+
second_half = text[middle - overlap :]
132134
yield from self.split_page_by_max_tokens(page_num, first_half)
133135
yield from self.split_page_by_max_tokens(page_num, second_half)
134136

tests/test_prepdocslib_textsplitter.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,7 @@ async def test_sentencetextsplitter_multilang(test_doc, tmp_path):
120120
# Verify the size of the sections
121121
token_lengths = []
122122
for section in sections:
123+
assert section.split_page.text != ""
123124
assert len(section.split_page.text) <= (text_splitter.max_section_length * 1.2)
124125
# Verify the number of tokens is below 500
125126
token_lengths.append((len(bpe.encode(section.split_page.text)), len(section.split_page.text)))

0 commit comments

Comments
 (0)