Skip to content

Commit a2ff14a

Browse files
committed
Update splitting algorithm with better overlap algorithm, rename SplitPage to Chunk
1 parent 7cebf13 commit a2ff14a

File tree

10 files changed: 1287 additions & 737 deletions

app/backend/prepdocslib/filestrategy.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,11 @@ async def parse_file(
4040
if image_embeddings_client:
4141
image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes)
4242
logger.info("Splitting '%s' into sections", file.filename())
43-
sections = [
44-
Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
45-
]
46-
# For now, add the images back to each split page based off split_page.page_num
43+
sections = [Section(chunk, content=file, category=category) for chunk in processor.splitter.split_pages(pages)]
44+
# For now, add the images back to each split chunk based off chunk.page_num
4745
for section in sections:
48-
section.split_page.images = [
49-
image for page in pages if page.page_num == section.split_page.page_num for image in page.images
46+
section.chunk.images = [
47+
image for page in pages if page.page_num == section.chunk.page_num for image in page.images
5048
]
5149
return sections
5250

app/backend/prepdocslib/page.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,16 @@ class Page:
3232

3333

3434
@dataclass
35-
class SplitPage:
36-
"""
37-
A section of a page that has been split into a smaller chunk.
35+
class Chunk:
36+
"""Semantic chunk emitted by the splitter (may originate wholly within one page
37+
or be the result of a cross-page merge / fragment shift).
3838
3939
Attributes:
40-
page_num (int): Page number (0-indexed)
41-
text (str): The text of the section
40+
page_num (int): Logical source page number (0-indexed) for the originating
41+
portion of content. For merged content spanning pages we keep the earliest
42+
contributing page number for stable attribution.
43+
text (str): Textual content of the chunk.
44+
images (list[ImageOnPage]): Images associated with this chunk, if any.
4245
"""
4346

4447
page_num: int

app/backend/prepdocslib/searchmanager.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from .embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddings
3838
from .listfilestrategy import File
3939
from .strategy import SearchInfo
40-
from .textsplitter import SplitPage
40+
from .textsplitter import Chunk
4141

4242
logger = logging.getLogger("scripts")
4343

@@ -47,8 +47,8 @@ class Section:
4747
A section of a page that is stored in a search service. These sections are used as context by Azure OpenAI service
4848
"""
4949

50-
def __init__(self, split_page: SplitPage, content: File, category: Optional[str] = None):
51-
self.split_page = split_page # content comes from here
50+
def __init__(self, chunk: Chunk, content: File, category: Optional[str] = None):
51+
self.chunk = chunk # content comes from here
5252
self.content = content # sourcepage and sourcefile come from here
5353
self.category = category
5454
# this also needs images which will become the images field
@@ -474,10 +474,10 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
474474
documents = [
475475
{
476476
"id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
477-
"content": section.split_page.text,
477+
"content": section.chunk.text,
478478
"category": section.category,
479479
"sourcepage": BlobManager.sourcepage_from_file_page(
480-
filename=section.content.filename(), page=section.split_page.page_num
480+
filename=section.content.filename(), page=section.chunk.page_num
481481
),
482482
"sourcefile": section.content.filename(),
483483
"images": [
@@ -487,7 +487,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
487487
"boundingbox": image.bbox,
488488
"embedding": image.embedding,
489489
}
490-
for image in section.split_page.images
490+
for image in section.chunk.images
491491
],
492492
**section.content.acls,
493493
}
@@ -500,7 +500,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
500500
if self.field_name_embedding is None:
501501
raise ValueError("Embedding field name must be set")
502502
embeddings = await self.embeddings.create_embeddings(
503-
texts=[section.split_page.text for section in batch]
503+
texts=[section.chunk.text for section in batch]
504504
)
505505
for i, document in enumerate(documents):
506506
document[self.field_name_embedding] = embeddings[i]

0 commit comments

Comments
 (0)