Skip to content

Commit e835da3

Browse files
bug #1152: Fix for indexing small docs (#1155)
* Fix for indexing small docs
* fix update
* format using black
* black formatting test check
* fix for test_split_empty_pages()
* run ruff
* add test
* reduce diff size

---------

Co-authored-by: Matt Gotteiner <[email protected]>
1 parent 5e9d142 commit e835da3

File tree

2 files changed

17 additions and 1 deletion (+17 -1 lines changed)

2 files changed

17 additions and 1 deletion (+17 -1 lines changed)

scripts/prepdocslib/textsplitter.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,14 @@ def find_page(offset):
 41  41 |               return pages[num_pages - 1].page_num
 42  42 |
 43  43 |           all_text = "".join(page.text for page in pages)
     44 | +         if len(all_text.strip()) == 0:
     45 | +             return
     46 | +
 44  47 |           length = len(all_text)
     48 | +         if length <= self.max_section_length:
     49 | +             yield SplitPage(page_num=find_page(0), text=all_text)
     50 | +             return
     51 | +
 45  52 |           start = 0
 46  53 |           end = length
 47  54 |           while start + self.section_overlap < length:

tests/test_prepdocslib_textsplitter.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
  6   6 |   from scripts.prepdocslib.listfilestrategy import LocalListFileStrategy
  7   7 |   from scripts.prepdocslib.pdfparser import LocalPdfParser
  8   8 |   from scripts.prepdocslib.searchmanager import Section
  9     | - from scripts.prepdocslib.textsplitter import TextSplitter
      9 | + from scripts.prepdocslib.textsplitter import Page, TextSplitter
 10  10 |
 11  11 |
 12  12 |   def test_split_empty_pages():
@@ -15,6 +15,15 @@ def test_split_empty_pages():
 15  15 |       assert list(t.split_pages([])) == []
 16  16 |
 17  17 |
     18 | + def test_split_small_pages():
     19 | +     t = TextSplitter(has_image_embeddings=False, verbose=True)
     20 | +
     21 | +     split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text="Not a large page")]))
     22 | +     assert len(split_pages) == 1
     23 | +     assert split_pages[0].page_num == 0
     24 | +     assert split_pages[0].text == "Not a large page"
     25 | +
     26 | +
 18  27 |   @pytest.mark.asyncio
 19  28 |   async def test_list_parse_and_split(tmp_path):
 20  29 |       text_splitter = TextSplitter(False, True)

0 commit comments

Comments
 (0)