Azure-Samples
diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py
Lines changed: 4 additions & 6 deletions
diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py
Lines changed: 8 additions & 5 deletions
diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py
Lines changed: 7 additions & 7 deletions
@@ -40,13 +40,11 @@ async def parse_file(
             if image_embeddings_client:
                 image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes)
     logger.info("Splitting '%s' into sections", file.filename())
-    sections = [
-        Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
-    ]
-    # For now, add the images back to each split page based off split_page.page_num
+    sections = [Section(chunk, content=file, category=category) for chunk in processor.splitter.split_pages(pages)]
+    # For now, add the images back to each split chunk based off chunk.page_num
     for section in sections:
-        section.split_page.images = [
-            image for page in pages if page.page_num == section.split_page.page_num for image in page.images
+        section.chunk.images = [
+            image for page in pages if page.page_num == section.chunk.page_num for image in page.images
         ]
     return sections
 
 
@@ -32,13 +32,16 @@ class Page:
 
 
 @dataclass
-class SplitPage:
-    """
-    A section of a page that has been split into a smaller chunk.
+class Chunk:
+    """Semantic chunk emitted by the splitter (may originate wholly within one page
+    or be the result of a cross-page merge / fragment shift).
 
     Attributes:
-        page_num (int): Page number (0-indexed)
-        text (str): The text of the section
+        page_num (int): Logical source page number (0-indexed) for the originating
+            portion of content. For merged content spanning pages we keep the earliest
+            contributing page number for stable attribution.
+        text (str): Textual content of the chunk.
+        images (list[ImageOnPage]): Images associated with this chunk, if any.
     """
 
     page_num: int
 
@@ -37,7 +37,7 @@
 from .embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddings
 from .listfilestrategy import File
 from .strategy import SearchInfo
-from .textsplitter import SplitPage
+from .textsplitter import Chunk
 
 logger = logging.getLogger("scripts")
 
@@ -47,8 +47,8 @@ class Section:
     A section of a page that is stored in a search service. These sections are used as context by Azure OpenAI service
     """
 
-    def __init__(self, split_page: SplitPage, content: File, category: Optional[str] = None):
-        self.split_page = split_page  # content comes from here
+    def __init__(self, chunk: Chunk, content: File, category: Optional[str] = None):
+        self.chunk = chunk  # content comes from here
         self.content = content  # sourcepage and sourcefile come from here
         self.category = category
         # this also needs images which will become the images field
@@ -474,10 +474,10 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
                 documents = [
                     {
                         "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
-                        "content": section.split_page.text,
+                        "content": section.chunk.text,
                         "category": section.category,
                         "sourcepage": BlobManager.sourcepage_from_file_page(
-                            filename=section.content.filename(), page=section.split_page.page_num
+                            filename=section.content.filename(), page=section.chunk.page_num
                         ),
                         "sourcefile": section.content.filename(),
                         "images": [
@@ -487,7 +487,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
                                 "boundingbox": image.bbox,
                                 "embedding": image.embedding,
                             }
-                            for image in section.split_page.images
+                            for image in section.chunk.images
                         ],
                         **section.content.acls,
                     }
@@ -500,7 +500,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
                     if self.field_name_embedding is None:
                         raise ValueError("Embedding field name must be set")
                     embeddings = await self.embeddings.create_embeddings(
-                        texts=[section.split_page.text for section in batch]
+                        texts=[section.chunk.text for section in batch]
                     )
                     for i, document in enumerate(documents):
                         document[self.field_name_embedding] = embeddings[i]