Skip to content

Commit 9fcaa55

Browse files
committed
More code cleanups
1 parent 6d4e490 commit 9fcaa55

File tree

6 files changed

+242
-199
lines changed

6 files changed

+242
-199
lines changed

app/backend/prepdocs.py

Lines changed: 14 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,19 @@
1111
from rich.logging import RichHandler
1212

1313
from load_azd_env import load_azd_env
14-
from prepdocslib.csvparser import CsvParser
15-
from prepdocslib.fileprocessor import FileProcessor
1614
from prepdocslib.filestrategy import FileStrategy
1715
from prepdocslib.integratedvectorizerstrategy import (
1816
IntegratedVectorizerStrategy,
1917
)
20-
from prepdocslib.jsonparser import JsonParser
2118
from prepdocslib.listfilestrategy import (
2219
ADLSGen2ListFileStrategy,
2320
ListFileStrategy,
2421
LocalListFileStrategy,
2522
)
26-
from prepdocslib.parser import Parser
2723
from prepdocslib.servicesetup import (
2824
OpenAIHost,
25+
build_file_processors,
2926
clean_key_if_exists,
30-
select_parser,
3127
setup_blob_manager,
3228
setup_embeddings_service,
3329
setup_figure_processor,
@@ -36,8 +32,6 @@
3632
setup_search_info,
3733
)
3834
from prepdocslib.strategy import DocumentAction, Strategy
39-
from prepdocslib.textparser import TextParser
40-
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
4135

4236
logger = logging.getLogger("scripts")
4337

@@ -100,61 +94,20 @@ def setup_file_processors(
10094
openai_deployment: Optional[str] = None,
10195
content_understanding_endpoint: Optional[str] = None,
10296
):
103-
sentence_text_splitter = SentenceTextSplitter()
97+
"""Setup file processors and figure processor for document ingestion.
98+
99+
Uses build_file_processors from servicesetup to ensure consistent parser/splitter
100+
selection logic with the Azure Functions cloud ingestion pipeline.
101+
"""
102+
file_processors = build_file_processors(
103+
azure_credential=azure_credential,
104+
document_intelligence_service=document_intelligence_service,
105+
document_intelligence_key=document_intelligence_key,
106+
use_local_pdf_parser=local_pdf_parser,
107+
use_local_html_parser=local_html_parser,
108+
process_figures=use_multimodal,
109+
)
104110

105-
# Build mapping of file extensions to parsers using shared select_parser helper.
106-
# Each select attempt may instantiate a DI parser; duplication is acceptable at startup.
107-
def _try_select(ext: str, content_type: str) -> Parser | None:
108-
file_name = f"dummy{ext}"
109-
try:
110-
return select_parser(
111-
file_name=file_name,
112-
content_type=content_type,
113-
azure_credential=azure_credential,
114-
document_intelligence_service=document_intelligence_service,
115-
document_intelligence_key=document_intelligence_key,
116-
process_figures=use_multimodal,
117-
use_local_pdf_parser=local_pdf_parser,
118-
use_local_html_parser=local_html_parser,
119-
)
120-
except ValueError:
121-
return None
122-
123-
pdf_parser: Parser | None = _try_select(".pdf", "application/pdf")
124-
html_parser: Parser | None = _try_select(".html", "text/html")
125-
126-
# DI-only formats
127-
di_exts = [
128-
".docx",
129-
".pptx",
130-
".xlsx",
131-
".png",
132-
".jpg",
133-
".jpeg",
134-
".tiff",
135-
".bmp",
136-
".heic",
137-
]
138-
di_parsers: dict[str, Parser] = {}
139-
for ext in di_exts:
140-
parser = _try_select(ext, "application/octet-stream")
141-
if parser is not None:
142-
di_parsers[ext] = parser
143-
144-
# These file formats can always be parsed:
145-
file_processors = {
146-
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
147-
".md": FileProcessor(TextParser(), sentence_text_splitter),
148-
".txt": FileProcessor(TextParser(), sentence_text_splitter),
149-
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
150-
}
151-
# These require either a Python package or Document Intelligence
152-
if pdf_parser is not None:
153-
file_processors[".pdf"] = FileProcessor(pdf_parser, sentence_text_splitter)
154-
if html_parser is not None:
155-
file_processors[".html"] = FileProcessor(html_parser, sentence_text_splitter)
156-
for ext, parser in di_parsers.items():
157-
file_processors[ext] = FileProcessor(parser, sentence_text_splitter)
158111
figure_processor = setup_figure_processor(
159112
credential=azure_credential,
160113
use_multimodal=use_multimodal,

app/backend/prepdocslib/servicesetup.py

Lines changed: 75 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@
1212
from openai import AsyncOpenAI
1313

1414
from .blobmanager import BlobManager
15+
from .csvparser import CsvParser
1516
from .embeddings import ImageEmbeddings, OpenAIEmbeddings
1617
from .figureprocessor import FigureProcessor, MediaDescriptionStrategy
18+
from .fileprocessor import FileProcessor
1719
from .htmlparser import LocalHTMLParser
20+
from .jsonparser import JsonParser
1821
from .parser import Parser
1922
from .pdfparser import DocumentAnalysisParser, LocalPdfParser
2023
from .strategy import SearchInfo
2124
from .textparser import TextParser
25+
from .textsplitter import SentenceTextSplitter, SimpleTextSplitter
2226

2327
logger = logging.getLogger("scripts")
2428

@@ -241,77 +245,92 @@ def setup_figure_processor(
241245
return None
242246

243247

244-
def select_parser(
248+
def build_file_processors(
    *,
    azure_credential: AsyncTokenCredential,
    document_intelligence_service: str | None,
    document_intelligence_key: str | None = None,
    use_local_pdf_parser: bool = False,
    use_local_html_parser: bool = False,
    process_figures: bool = False,
) -> dict[str, FileProcessor]:
    """Build the mapping from file extension to the FileProcessor that handles it.

    Args:
        azure_credential: Token credential used for Document Intelligence when no key is given
        document_intelligence_service: Name of the Document Intelligence service; a missing or
            empty value disables Document Intelligence
        document_intelligence_key: Optional key credential (used instead of the token credential)
        use_local_pdf_parser: Use the local PDF parser even if Document Intelligence is configured
        use_local_html_parser: Use the local HTML parser even if Document Intelligence is configured
        process_figures: Passed through to the Document Intelligence parser

    Returns:
        Dictionary keyed by lowercase extension (including the leading dot). The text formats,
        ".pdf" and ".html" are always present; the Document Intelligence-only formats are
        present only when a Document Intelligence service is configured.
    """
    sentence_text_splitter = SentenceTextSplitter()

    # Only build the Document Intelligence parser when a service name was actually supplied.
    doc_int_parser: DocumentAnalysisParser | None = None
    if document_intelligence_service:
        credential: AsyncTokenCredential | AzureKeyCredential
        if document_intelligence_key:
            credential = AzureKeyCredential(document_intelligence_key)
        else:
            credential = azure_credential
        doc_int_parser = DocumentAnalysisParser(
            endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
            credential=credential,
            process_figures=process_figures,
        )

    # Fall back to the local parsers whenever no DI parser was built. Testing the parser itself
    # (rather than `document_intelligence_service is None`) keeps PDF/HTML support when the
    # service name is an empty string, which previously left both parsers unset.
    pdf_parser: Parser
    if use_local_pdf_parser or doc_int_parser is None:
        pdf_parser = LocalPdfParser()
    else:
        pdf_parser = doc_int_parser

    html_parser: Parser
    if use_local_html_parser or doc_int_parser is None:
        html_parser = LocalHTMLParser()
    else:
        html_parser = doc_int_parser

    file_processors: dict[str, FileProcessor] = {
        # These file formats can always be parsed:
        ".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
        ".md": FileProcessor(TextParser(), sentence_text_splitter),
        ".txt": FileProcessor(TextParser(), sentence_text_splitter),
        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
        # These use either a local Python parser or Document Intelligence:
        ".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
        ".html": FileProcessor(html_parser, sentence_text_splitter),
    }

    # These file formats require Document Intelligence
    if doc_int_parser is not None:
        for ext in (".docx", ".pptx", ".xlsx", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"):
            file_processors[ext] = FileProcessor(doc_int_parser, sentence_text_splitter)

    return file_processors
317+
318+
319+
def select_processor_for_filename(file_name: str, file_processors: dict[str, FileProcessor]) -> FileProcessor:
    """Select the appropriate file processor for a given filename.

    The lookup key is the file's last extension (including the leading dot),
    lowercased, so "Report.PDF" and "report.pdf" resolve to the same entry.

    Args:
        file_name: Name of the file to process
        file_processors: Dictionary mapping file extensions to FileProcessor instances

    Returns:
        FileProcessor instance for the file

    Raises:
        ValueError: If the file extension is not supported (including files with no extension)
    """
    file_ext = os.path.splitext(file_name)[1].lower()
    file_processor = file_processors.get(file_ext)
    # Compare against None explicitly: only a missing entry means "unsupported";
    # a registered processor must be returned even if it evaluates as falsy.
    if file_processor is None:
        raise ValueError(f"Unsupported file type: {file_name}")
    return file_processor

app/functions/document_extractor/function_app.py

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,12 @@
1515
from azure.core.exceptions import HttpResponseError
1616
from azure.identity.aio import ManagedIdentityCredential
1717

18+
from prepdocslib.fileprocessor import FileProcessor
1819
from prepdocslib.page import Page
19-
from prepdocslib.servicesetup import select_parser
20+
from prepdocslib.servicesetup import (
21+
build_file_processors,
22+
select_processor_for_filename,
23+
)
2024

2125
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
2226

@@ -25,10 +29,7 @@
2529

2630
@dataclass
2731
class GlobalSettings:
28-
use_local_pdf_parser: bool
29-
use_local_html_parser: bool
30-
use_multimodal: bool
31-
document_intelligence_service: str | None
32+
file_processors: dict[str, FileProcessor]
3233
azure_credential: ManagedIdentityCredential
3334

3435

@@ -52,11 +53,18 @@ def configure_global_settings():
5253
logger.info("Using default Managed Identity without client ID")
5354
azure_credential = ManagedIdentityCredential()
5455

55-
settings = GlobalSettings(
56+
# Build file processors dict for parser selection
57+
file_processors = build_file_processors(
58+
azure_credential=azure_credential,
59+
document_intelligence_service=document_intelligence_service,
60+
document_intelligence_key=None,
5661
use_local_pdf_parser=use_local_pdf_parser,
5762
use_local_html_parser=use_local_html_parser,
58-
use_multimodal=use_multimodal,
59-
document_intelligence_service=document_intelligence_service,
63+
process_figures=use_multimodal,
64+
)
65+
66+
settings = GlobalSettings(
67+
file_processors=file_processors,
6068
azure_credential=azure_credential,
6169
)
6270

@@ -176,16 +184,9 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]:
176184
document_stream, file_name, content_type = get_document_stream_filedata(data)
177185
logger.info("Processing document: %s", file_name)
178186

179-
parser = select_parser(
180-
file_name=file_name,
181-
content_type=content_type,
182-
azure_credential=settings.azure_credential,
183-
document_intelligence_service=settings.document_intelligence_service,
184-
document_intelligence_key=None,
185-
process_figures=settings.use_multimodal,
186-
use_local_pdf_parser=settings.use_local_pdf_parser,
187-
use_local_html_parser=settings.use_local_html_parser,
188-
)
187+
# Get parser from file_processors dict based on file extension
188+
file_processor = select_processor_for_filename(file_name, settings.file_processors)
189+
parser = file_processor.parser
189190

190191
pages: list[Page] = []
191192
try:

0 commit comments

Comments
 (0)