Skip to content

Commit 23d2e38

Browse files
authored
Fix overlap percentage calculation (#1668)
* Fix overlap calculation
* Add test for initialization of section_overlap
1 parent 7ffcb3b commit 23d2e38

File tree

2 files changed: 24 additions (+24) and 1 deletion (-1)

app/backend/prepdocslib/textsplitter.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,7 @@ def __init__(self, has_image_embeddings: bool, max_tokens_per_section: int = 500
         self.max_section_length = DEFAULT_SECTION_LENGTH
         self.sentence_search_limit = 100
         self.max_tokens_per_section = max_tokens_per_section
-        self.section_overlap = self.max_section_length // DEFAULT_OVERLAP_PERCENT
+        self.section_overlap = int(self.max_section_length * DEFAULT_OVERLAP_PERCENT / 100)
         self.has_image_embeddings = has_image_embeddings
 
     def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:

tests/test_sentencetextsplitter.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,23 @@
+from unittest.mock import patch
+
+import pytest
+
+from prepdocslib.textsplitter import SentenceTextSplitter
+
+
+@pytest.mark.parametrize(
+    "actual_percentage, expected_section_overlap",
+    [
+        (100, 1000),
+        (80, 800),
+        (10.75, 107),
+        (10, 100),
+        (0, 0),
+    ],
+)
+def test_sentence_text_splitter_initializes_overlap_correctly(
+    actual_percentage: float, expected_section_overlap: float
+):
+    with patch("prepdocslib.textsplitter.DEFAULT_OVERLAP_PERCENT", actual_percentage):
+        subject = SentenceTextSplitter(False)
+        assert subject.section_overlap == expected_section_overlap

0 commit comments

Comments
 (0)