Skip to content

Commit ad31bb0

Browse files
committed
Improve logging output
1 parent 44c912b commit ad31bb0

File tree

10 files changed: 18 additions, 22 deletions

app/backend/prepdocs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
315315
"--concurrency",
316316
type=int,
317317
default=FileStrategy.DEFAULT_CONCURRENCY,
318-
help="Max. number of concurrent tasks to run for processing files (file strategy only) (default: 10)",
318+
help="Max. number of concurrent tasks to run for processing files (file strategy only) (default: 4)",
319319
)
320320

321321
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

app/backend/prepdocslib/blobmanager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,15 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
5656
if file.url is None:
5757
with open(file.content.name, "rb") as reopened_file:
5858
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
59-
logger.info("Uploading blob for whole file -> %s", blob_name)
59+
logger.info("'%s': Uploading blob for whole file", file.content.name)
6060
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
6161
file.url = blob_client.url
6262

6363
if self.store_page_images:
6464
if os.path.splitext(file.content.name)[1].lower() == ".pdf":
6565
return await self.upload_pdf_blob_images(service_client, container_client, file)
6666
else:
67-
logger.info("File %s is not a PDF, skipping image upload", file.content.name)
67+
logger.info("'%s': File is not a PDF, skipping image upload", file.content.name)
6868

6969
return None
7070

app/backend/prepdocslib/embeddings.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,6 @@ async def create_embedding_batch(self, texts: list[str], dimensions_args: ExtraA
114114
model=self.open_ai_model_name, input=batch.texts, **dimensions_args
115115
)
116116
embeddings.extend([data.embedding for data in emb_response.data])
117-
logger.info(
118-
"Computed embeddings in batch. Batch size: %d, Token count: %d",
119-
len(batch.texts),
120-
batch.token_length,
121-
)
122117

123118
return embeddings
124119

@@ -134,7 +129,6 @@ async def create_embedding_single(self, text: str, dimensions_args: ExtraArgs) -
134129
emb_response = await client.embeddings.create(
135130
model=self.open_ai_model_name, input=text, **dimensions_args
136131
)
137-
logger.info("Computed embedding for text section. Character count: %d", len(text))
138132

139133
return emb_response.data[0].embedding
140134

app/backend/prepdocslib/filestrategy.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ async def parse_file(
2424
key = file.file_extension().lower()
2525
processor = file_processors.get(key)
2626
if processor is None:
27-
logger.info("Skipping '%s', no parser found.", file.filename())
27+
logger.info("'%s': Skipping, no parser found.", file.content.name)
2828
return []
29-
logger.info("Ingesting '%s'", file.filename())
29+
logger.info("'%s': Starting ingestion process", file.content.name)
3030
pages = [page async for page in processor.parser.parse(content=file.content)]
31-
logger.info("Splitting '%s' into sections", file.filename())
31+
logger.info("'%s': Splitting into sections", file.content.name)
3232
if image_embeddings:
3333
logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.")
3434
sections = [
@@ -113,9 +113,11 @@ async def process_file_worker(semaphore: asyncio.Semaphore, file: File):
113113
blob_image_embeddings: Optional[list[list[float]]] = None
114114
if self.image_embeddings and blob_sas_uris:
115115
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
116+
logger.info("'%s': Computing embeddings and updating search index", file.content.name)
116117
await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
117118
finally:
118119
if file:
120+
logger.info("'%s': Finished processing file", file.content.name)
119121
file.close()
120122

121123
if self.document_action == DocumentAction.Add:

app/backend/prepdocslib/htmlparser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
3939
Returns:
4040
Page: The parsed html Page.
4141
"""
42-
logger.info("Extracting text from '%s' using local HTML parser (BeautifulSoup)", content.name)
42+
logger.info("'%s': Extracting text using local HTML parser (BeautifulSoup)", content.name)
4343

4444
data = content.read()
4545
soup = BeautifulSoup(data, "html.parser")

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
129129
return skillset
130130

131131
async def setup(self):
132-
logger.info("Setting up search index using integrated vectorization...")
132+
logger.info("Setting up search index using integrated vectorization")
133133
search_manager = SearchManager(
134134
search_info=self.search_info,
135135
search_analyzer_name=self.search_analyzer_name,

app/backend/prepdocslib/listfilestrategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def check_md5(self, path: str) -> bool:
102102
stored_hash = md5_f.read()
103103

104104
if stored_hash and stored_hash.strip() == existing_hash.strip():
105-
logger.info("Skipping %s, no changes detected.", path)
105+
logger.info("'%s': Skipping, no changes detected.", path)
106106
return True
107107

108108
# Write the hash

app/backend/prepdocslib/mediadescriber.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ async def poll():
5858
return await poll()
5959

6060
async def create_analyzer(self):
61-
logger.info("Creating analyzer '%s'...", self.analyzer_schema["analyzerId"])
61+
logger.info("Creating analyzer '%s'", self.analyzer_schema["analyzerId"])
6262

6363
token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
6464
token = await token_provider()
@@ -84,7 +84,7 @@ async def create_analyzer(self):
8484
await self.poll_api(session, poll_url, headers)
8585

8686
async def describe_image(self, image_bytes: bytes) -> str:
87-
logger.info("Sending image to Azure Content Understanding service...")
87+
logger.info("Sending image to Azure Content Understanding service")
8888
async with aiohttp.ClientSession() as session:
8989
token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
9090
headers = {"Authorization": "Bearer " + token.token}

app/backend/prepdocslib/pdfparser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class LocalPdfParser(Parser):
3333
"""
3434

3535
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
36-
logger.info("Extracting text from '%s' using local PDF parser (pypdf)", content.name)
36+
logger.info("'%s': Extracting text using local PDF parser (pypdf)", content.name)
3737

3838
reader = PdfReader(content)
3939
pages = reader.pages
@@ -65,7 +65,7 @@ def __init__(
6565
self.content_understanding_endpoint = content_understanding_endpoint
6666

6767
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
68-
logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
68+
logger.info("'%s': Extracting text using Azure Document Intelligence", content.name)
6969

7070
async with DocumentIntelligenceClient(
7171
endpoint=self.endpoint, credential=self.credential

app/backend/prepdocslib/searchmanager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def __init__(
7777
self.search_images = search_images
7878

7979
async def create_index(self):
80-
logger.info("Checking whether search index %s exists...", self.search_info.index_name)
80+
logger.info("Checking whether search index '%s' exists", self.search_info.index_name)
8181

8282
async with self.search_info.create_search_index_client() as search_index_client:
8383

@@ -280,10 +280,10 @@ async def create_index(self):
280280

281281
await search_index_client.create_index(index)
282282
else:
283-
logger.info("Search index %s already exists", self.search_info.index_name)
283+
logger.info("Search index '%s' already exists", self.search_info.index_name)
284284
existing_index = await search_index_client.get_index(self.search_info.index_name)
285285
if not any(field.name == "storageUrl" for field in existing_index.fields):
286-
logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
286+
logger.info("Adding storageUrl field to index '%s'", self.search_info.index_name)
287287
existing_index.fields.append(
288288
SimpleField(
289289
name="storageUrl",

Comments (0 commit comments)