37
37
from .embeddings import AzureOpenAIEmbeddingService , OpenAIEmbeddings
38
38
from .listfilestrategy import File
39
39
from .strategy import SearchInfo
40
- from .textsplitter import SplitPage
40
+ from .textsplitter import Chunk
41
41
42
42
logger = logging .getLogger ("scripts" )
43
43
@@ -47,8 +47,8 @@ class Section:
47
47
A section of a page that is stored in a search service. These sections are used as context by Azure OpenAI service
48
48
"""
49
49
50
- def __init__ (self , split_page : SplitPage , content : File , category : Optional [str ] = None ):
51
- self .split_page = split_page # content comes from here
50
+ def __init__ (self , chunk : Chunk , content : File , category : Optional [str ] = None ):
51
+ self .chunk = chunk # content comes from here
52
52
self .content = content # sourcepage and sourcefile come from here
53
53
self .category = category
54
54
# this also needs images which will become the images field
@@ -474,10 +474,10 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
474
474
documents = [
475
475
{
476
476
"id" : f"{ section .content .filename_to_id ()} -page-{ section_index + batch_index * MAX_BATCH_SIZE } " ,
477
- "content" : section .split_page .text ,
477
+ "content" : section .chunk .text ,
478
478
"category" : section .category ,
479
479
"sourcepage" : BlobManager .sourcepage_from_file_page (
480
- filename = section .content .filename (), page = section .split_page .page_num
480
+ filename = section .content .filename (), page = section .chunk .page_num
481
481
),
482
482
"sourcefile" : section .content .filename (),
483
483
"images" : [
@@ -487,7 +487,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
487
487
"boundingbox" : image .bbox ,
488
488
"embedding" : image .embedding ,
489
489
}
490
- for image in section .split_page .images
490
+ for image in section .chunk .images
491
491
],
492
492
** section .content .acls ,
493
493
}
@@ -500,7 +500,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
500
500
if self .field_name_embedding is None :
501
501
raise ValueError ("Embedding field name must be set" )
502
502
embeddings = await self .embeddings .create_embeddings (
503
- texts = [section .split_page .text for section in batch ]
503
+ texts = [section .chunk .text for section in batch ]
504
504
)
505
505
for i , document in enumerate (documents ):
506
506
document [self .field_name_embedding ] = embeddings [i ]
0 commit comments