
Commit d896376

Don't disable text chunking when GPT4vision is enabled (#1355)
* Don't disable chunking when using vision; degrade gracefully instead
* Add a test
1 parent 87f2b9d commit d896376

File tree

12 files changed: +185 / -33 lines


app/backend/core/imageshelper.py

Lines changed: 12 additions & 5 deletions
@@ -1,10 +1,12 @@
 import base64
+import logging
 import math
 import os
 import re
 from io import BytesIO
 from typing import Optional

+from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.blob.aio import ContainerClient
 from PIL import Image
 from typing_extensions import Literal, Required, TypedDict
@@ -22,12 +24,17 @@ class ImageURL(TypedDict, total=False):

 async def download_blob_as_base64(blob_container_client: ContainerClient, file_path: str) -> Optional[str]:
     base_name, _ = os.path.splitext(file_path)
-    blob = await blob_container_client.get_blob_client(base_name + ".png").download_blob()
-
-    if not blob.properties:
+    image_filename = base_name + ".png"
+    try:
+        blob = await blob_container_client.get_blob_client(image_filename).download_blob()
+        if not blob.properties:
+            logging.warning(f"No blob exists for {image_filename}")
+            return None
+        img = base64.b64encode(await blob.readall()).decode("utf-8")
+        return f"data:image/png;base64,{img}"
+    except ResourceNotFoundError:
+        logging.warning(f"No blob exists for {image_filename}")
         return None
-    img = base64.b64encode(await blob.readall()).decode("utf-8")
-    return f"data:image/png;base64,{img}"


 async def fetch_image(blob_container_client: ContainerClient, result: Document) -> Optional[ImageURL]:
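The try/except around download_blob_as_base64 is the graceful-degradation part of this change: a missing page image now logs a warning and returns None instead of raising. A minimal sketch of exercising that path with mocked Azure clients (the import path and the mock setup below are assumptions for illustration, not code from this commit):

import asyncio
from unittest import mock

from azure.core.exceptions import ResourceNotFoundError

# Assumed import path, relative to app/backend
from core.imageshelper import download_blob_as_base64


async def main():
    # Simulate a container where the corresponding .png blob was never uploaded.
    blob_client = mock.MagicMock()
    blob_client.download_blob = mock.AsyncMock(side_effect=ResourceNotFoundError("no blob"))
    container_client = mock.MagicMock()
    container_client.get_blob_client.return_value = blob_client

    # Before this commit the ResourceNotFoundError propagated; now the helper
    # logs a warning and degrades to None.
    assert await download_blob_as_base64(container_client, "report.pdf") is None


asyncio.run(main())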

scripts/prepdocs.py

Lines changed: 2 additions & 2 deletions
@@ -101,15 +101,15 @@ def setup_list_file_strategy(
         if datalake_filesystem is None or datalake_path is None:
             raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2")
         adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key
-        logger.info(f"Using Data Lake Gen2 Storage Account {datalake_storage_account}")
+        logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account)
         list_file_strategy = ADLSGen2ListFileStrategy(
             data_lake_storage_account=datalake_storage_account,
             data_lake_filesystem=datalake_filesystem,
             data_lake_path=datalake_path,
             credential=adls_gen2_creds,
         )
     elif local_files:
-        logger.info(f"Using local files in {local_files}")
+        logger.info("Using local files: %s", local_files)
         list_file_strategy = LocalListFileStrategy(path_pattern=local_files)
     else:
         raise ValueError("Either local_files or datalake_storage_account must be provided.")
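Most of the remaining hunks make the same small change: f-strings passed to logger.info are replaced by %-style arguments, so the message is only interpolated when the record is actually emitted. A standalone illustration of the difference (the logger name and values here are made up for the demo):

import logging

logger = logging.getLogger("prepdocs_demo")
logger.setLevel(logging.WARNING)  # INFO records will be discarded

local_files = "./data/*"

# Eager: the f-string is always built, even though the record is dropped.
logger.info(f"Using local files in {local_files}")

# Lazy: formatting is deferred to the logging framework and skipped entirely here.
logger.info("Using local files: %s", local_files)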

scripts/prepdocslib/blobmanager.py

Lines changed: 9 additions & 6 deletions
@@ -55,11 +55,14 @@ async def upload_blob(self, file: File) -> Optional[List[str]]:
             # Re-open and upload the original file
             with open(file.content.name, "rb") as reopened_file:
                 blob_name = BlobManager.blob_name_from_file_name(file.content.name)
-                logger.info(f"\tUploading blob for whole file -> {blob_name}")
+                logger.info("Uploading blob for whole file -> %s", blob_name)
                 await container_client.upload_blob(blob_name, reopened_file, overwrite=True)

-            if self.store_page_images and os.path.splitext(file.content.name)[1].lower() == ".pdf":
-                return await self.upload_pdf_blob_images(service_client, container_client, file)
+            if self.store_page_images:
+                if os.path.splitext(file.content.name)[1].lower() == ".pdf":
+                    return await self.upload_pdf_blob_images(service_client, container_client, file)
+                else:
+                    logger.info("File %s is not a PDF, skipping image upload", file.content.name)

         return None

@@ -84,11 +87,11 @@ async def upload_pdf_blob_images(
         try:
             font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 20)
         except OSError:
-            logger.info("\tUnable to find arial.ttf or FreeMono.ttf, using default font")
+            logger.info("Unable to find arial.ttf or FreeMono.ttf, using default font")

         for i in range(page_count):
             blob_name = BlobManager.blob_image_name_from_file_page(file.content.name, i)
-            logger.info(f"\tConverting page {i} to image and uploading -> {blob_name}")
+            logger.info("Converting page %s to image and uploading -> %s", i, blob_name)

             doc = fitz.open(file.content.name)
             page = doc.load_page(i)
@@ -154,7 +157,7 @@ async def remove_blob(self, path: Optional[str] = None):
                     )
                 ) or (path is not None and blob_path == os.path.basename(path)):
                     continue
-                logger.info(f"\tRemoving blob {blob_path}")
+                logger.info("Removing blob %s", blob_path)
                 await container_client.delete_blob(blob_path)

     @classmethod
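store_page_images previously combined the feature flag and the file-extension test in a single condition, so non-PDF uploads were skipped silently; splitting them lets the new else branch say why. A tiny standalone helper mirroring the new guard (should_upload_page_images and the file names are hypothetical, not part of the repo):

import os


def should_upload_page_images(filename: str, store_page_images: bool) -> bool:
    # Page images are only generated for PDFs; anything else is logged and skipped.
    return store_page_images and os.path.splitext(filename)[1].lower() == ".pdf"


assert should_upload_page_images("report.PDF", store_page_images=True)
assert not should_upload_page_images("handbook.html", store_page_images=True)
assert not should_upload_page_images("report.pdf", store_page_images=False)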

scripts/prepdocslib/embeddings.py

Lines changed: 6 additions & 1 deletion
@@ -97,7 +97,11 @@ async def create_embedding_batch(self, texts: List[str]) -> List[List[float]]:
                 with attempt:
                     emb_response = await client.embeddings.create(model=self.open_ai_model_name, input=batch.texts)
                     embeddings.extend([data.embedding for data in emb_response.data])
-                    logger.info(f"Batch Completed. Batch size {len(batch.texts)} Token count {batch.token_length}")
+                    logger.info(
+                        "Computed embeddings in batch. Batch size: %d, Token count: %d",
+                        len(batch.texts),
+                        batch.token_length,
+                    )

         return embeddings

@@ -111,6 +115,7 @@ async def create_embedding_single(self, text: str) -> List[float]:
         ):
             with attempt:
                 emb_response = await client.embeddings.create(model=self.open_ai_model_name, input=text)
+                logger.info("Computed embedding for text section. Character count: %d", len(text))

         return emb_response.data[0].embedding

scripts/prepdocslib/filestrategy.py

Lines changed: 10 additions & 5 deletions
@@ -12,16 +12,21 @@


 async def parse_file(
-    file: File, file_processors: dict[str, FileProcessor], category: Optional[str] = None
+    file: File,
+    file_processors: dict[str, FileProcessor],
+    category: Optional[str] = None,
+    image_embeddings: Optional[ImageEmbeddings] = None,
 ) -> List[Section]:
     key = file.file_extension()
     processor = file_processors.get(key)
     if processor is None:
-        logger.info(f"Skipping '{file.filename()}', no parser found.")
+        logger.info("Skipping '%s', no parser found.", file.filename())
         return []
-    logger.info(f"Parsing '{file.filename()}'")
+    logger.info("Ingesting '%s'", file.filename())
     pages = [page async for page in processor.parser.parse(content=file.content)]
-    logger.info(f"Splitting '{file.filename()}' into sections")
+    logger.info("Splitting '%s' into sections", file.filename())
+    if image_embeddings:
+        logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.")
     sections = [
         Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
     ]
@@ -76,7 +81,7 @@ async def run(self):
         files = self.list_file_strategy.list()
         async for file in files:
             try:
-                sections = await parse_file(file, self.file_processors, self.category)
+                sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
                 if sections:
                     blob_sas_uris = await self.blob_manager.upload_blob(file)
                     blob_image_embeddings: Optional[List[List[float]]] = None
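parse_file now receives the image-embeddings setting so it can warn that chunking still happens even when whole-page images are stored. A rough sketch of that behavior with plain values standing in for the repo's File/FileProcessor types (every name below is a stand-in, not repo code):

import logging
from typing import List

logger = logging.getLogger(__name__)


def split_into_sections(filename: str, pages: List[str], image_embeddings_enabled: bool) -> List[str]:
    logger.info("Splitting '%s' into sections", filename)
    if image_embeddings_enabled:
        # New in this commit: chunking is no longer skipped, but each image
        # embedding still covers an entire page rather than a single chunk.
        logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.")
    return [chunk for page in pages for chunk in page.split(". ") if chunk]


sections = split_into_sections("report.pdf", ["First sentence. Second sentence."], image_embeddings_enabled=True)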

scripts/prepdocslib/htmlparser.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         Returns:
             Page: The parsed html Page.
         """
-        logger.info(f"\tExtracting text from '{content.name}' using local HTML parser (BeautifulSoup)")
+        logger.info("Extracting text from '%s' using local HTML parser (BeautifulSoup)", content.name)

         data = content.read()
         soup = BeautifulSoup(data, "html.parser")

scripts/prepdocslib/listfilestrategy.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def check_md5(self, path: str) -> bool:
             stored_hash = md5_f.read()

         if stored_hash and stored_hash.strip() == existing_hash.strip():
-            logger.info(f"Skipping {path}, no changes detected.")
+            logger.info("Skipping %s, no changes detected.", path)
             return True

         # Write the hash

scripts/prepdocslib/pdfparser.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ class LocalPdfParser(Parser):
     """

     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        logger.info(f"\tExtracting text from '{content.name}' using local PDF parser (pypdf)")
+        logger.info("Extracting text from '%s' using local PDF parser (pypdf)", content.name)

         reader = PdfReader(content)
         pages = reader.pages
@@ -46,7 +46,7 @@ def __init__(
         self.credential = credential

     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        logger.info(f"Extracting text from '{content.name}' using Azure Document Intelligence")
+        logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)

         async with DocumentIntelligenceClient(
             endpoint=self.endpoint, credential=self.credential

scripts/prepdocslib/searchmanager.py

Lines changed: 7 additions & 5 deletions
@@ -63,7 +63,7 @@ def __init__(
         self.search_images = search_images

     async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] = None):
-        logger.info(f"Ensuring search index {self.search_info.index_name} exists")
+        logger.info("Ensuring search index %s exists", self.search_info.index_name)

         async with self.search_info.create_search_index_client() as search_index_client:
             fields = [
@@ -175,10 +175,10 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] = None):
                 ),
             )
             if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
-                logger.info(f"Creating {self.search_info.index_name} search index")
+                logger.info("Creating %s search index", self.search_info.index_name)
                 await search_index_client.create_index(index)
             else:
-                logger.info(f"Search index {self.search_info.index_name} already exists")
+                logger.info("Search index %s already exists", self.search_info.index_name)

     async def update_content(self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None):
         MAX_BATCH_SIZE = 1000
@@ -220,7 +220,9 @@ async def update_content(self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None):
                 await search_client.upload_documents(documents)

     async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None):
-        logger.info(f"Removing sections from '{path or '<all>'}' from search index '{self.search_info.index_name}'")
+        logger.info(
+            "Removing sections from '{%s or '<all>'}' from search index '%s'", path, self.search_info.index_name
+        )
         async with self.search_info.create_search_client() as search_client:
             while True:
                 filter = None if path is None else f"sourcefile eq '{os.path.basename(path)}'"
@@ -233,6 +235,6 @@ async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None):
                     if not only_oid or document["oids"] == [only_oid]:
                         documents_to_remove.append({"id": document["id"]})
                 removed_docs = await search_client.delete_documents(documents_to_remove)
-                logger.info(f"\tRemoved {len(removed_docs)} sections from index")
+                logger.info("Removed %d sections from index", len(removed_docs))
                 # It can take a few seconds for search results to reflect changes, so wait a bit
                 await asyncio.sleep(2)

scripts/prepdocslib/textsplitter.py

Lines changed: 0 additions & 5 deletions
@@ -133,11 +133,6 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
             yield from self.split_page_by_max_tokens(page_num, second_half)

     def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]:
-        # Chunking is disabled when using GPT4V. To be updated in the future.
-        if self.has_image_embeddings:
-            for i, page in enumerate(pages):
-                yield SplitPage(page_num=i, text=page.text)
-
         def find_page(offset):
             num_pages = len(pages)
             for i in range(num_pages - 1):
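This deletion is the heart of the commit: split_pages no longer short-circuits into one SplitPage per page when image embeddings (GPT-4 with vision) are enabled, so every page goes through the normal sentence and token chunking. A toy before-and-after sketch, with a word-count chunker standing in for the real splitter (everything below is illustrative, not repo code):

from typing import Iterator, List, Tuple

MAX_WORDS = 100  # stand-in for the real token/character limits


def chunk(text: str) -> Iterator[str]:
    # Toy chunker: fixed word windows instead of the repo's sentence-aware splitting.
    words = text.split()
    for start in range(0, len(words), MAX_WORDS):
        yield " ".join(words[start : start + MAX_WORDS])


def split_pages_old(pages: List[str], has_image_embeddings: bool) -> List[Tuple[int, str]]:
    if has_image_embeddings:
        # Mirrors the deleted branch: one oversized chunk per page, bypassing the limits.
        return list(enumerate(pages))
    return [(i, piece) for i, text in enumerate(pages) for piece in chunk(text)]


def split_pages_new(pages: List[str], has_image_embeddings: bool) -> List[Tuple[int, str]]:
    # Post-#1355 behavior: the flag no longer bypasses chunking.
    return [(i, piece) for i, text in enumerate(pages) for piece in chunk(text)]


pages = ["word " * 250]
assert len(split_pages_old(pages, has_image_embeddings=True)) == 1
assert len(split_pages_new(pages, has_image_embeddings=True)) == 3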
