|
12 | 12 | from openai import AsyncOpenAI |
13 | 13 |
|
14 | 14 | from .blobmanager import BlobManager |
| 15 | +from .csvparser import CsvParser |
15 | 16 | from .embeddings import ImageEmbeddings, OpenAIEmbeddings |
16 | 17 | from .figureprocessor import FigureProcessor, MediaDescriptionStrategy |
| 18 | +from .fileprocessor import FileProcessor |
17 | 19 | from .htmlparser import LocalHTMLParser |
| 20 | +from .jsonparser import JsonParser |
18 | 21 | from .parser import Parser |
19 | 22 | from .pdfparser import DocumentAnalysisParser, LocalPdfParser |
20 | 23 | from .strategy import SearchInfo |
21 | 24 | from .textparser import TextParser |
| 25 | +from .textsplitter import SentenceTextSplitter, SimpleTextSplitter |
22 | 26 |
|
23 | 27 | logger = logging.getLogger("scripts") |
24 | 28 |
|
@@ -241,77 +245,92 @@ def setup_figure_processor( |
241 | 245 | return None |
242 | 246 |
|
243 | 247 |
|
244 | | -def select_parser( |
| 248 | +def build_file_processors( |
245 | 249 | *, |
246 | | - file_name: str, |
247 | | - content_type: str, |
248 | 250 | azure_credential: AsyncTokenCredential, |
249 | 251 | document_intelligence_service: str | None, |
250 | 252 | document_intelligence_key: str | None = None, |
251 | | - process_figures: bool = False, |
252 | 253 | use_local_pdf_parser: bool = False, |
253 | 254 | use_local_html_parser: bool = False, |
254 | | -) -> Parser: |
255 | | - """Return a parser instance appropriate for the file type and configuration. |
256 | | -
|
257 | | - Args: |
258 | | - file_name: Source filename (used to derive extension) |
259 | | - content_type: MIME type (fallback for extension-based selection) |
260 | | - azure_credential: Token credential for DI service |
261 | | - document_intelligence_service: Name of DI service (None disables DI) |
262 | | - document_intelligence_key: Optional key credential (overrides token when provided) |
263 | | - process_figures: Whether figure extraction should be enabled in DI parser |
264 | | - use_local_pdf_parser: Force local PDF parsing instead of DI |
265 | | - use_local_html_parser: Force local HTML parsing instead of DI |
266 | | -
|
267 | | - Returns: |
268 | | - Parser capable of yielding Page objects for the document. |
269 | | -
|
270 | | - Raises: |
271 | | - ValueError: Unsupported file type or missing DI configuration for required formats. |
272 | | - """ |
273 | | - extension = file_name.lower().rsplit(".", 1)[-1] if "." in file_name else "" |
274 | | - ext_with_dot = f".{extension}" if extension else "" |
| 255 | + process_figures: bool = False, |
| 256 | +) -> dict[str, FileProcessor]: |
| 257 | + sentence_text_splitter = SentenceTextSplitter() |
275 | 258 |
|
276 | | - # Build DI parser lazily only if needed |
277 | | - di_parser: DocumentAnalysisParser | None = None |
| 259 | + doc_int_parser: Optional[DocumentAnalysisParser] = None |
| 260 | + # Build the Document Intelligence parser only when a service name is configured |
278 | 261 | if document_intelligence_service: |
279 | 262 | credential: AsyncTokenCredential | AzureKeyCredential |
280 | 263 | if document_intelligence_key: |
281 | 264 | credential = AzureKeyCredential(document_intelligence_key) |
282 | 265 | else: |
283 | 266 | credential = azure_credential |
284 | | - di_parser = DocumentAnalysisParser( |
| 267 | + doc_int_parser = DocumentAnalysisParser( |
285 | 268 | endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", |
286 | 269 | credential=credential, |
287 | 270 | process_figures=process_figures, |
288 | 271 | ) |
289 | 272 |
|
290 | | - # Plain text / structured text formats always local |
291 | | - if ext_with_dot in {".txt", ".md", ".csv", ".json"} or content_type.startswith("text/plain"): |
292 | | - return TextParser() |
293 | | - |
294 | | - # HTML |
295 | | - if ext_with_dot in {".html", ".htm"} or content_type in {"text/html", "application/html"}: |
296 | | - if use_local_html_parser or not di_parser: |
297 | | - return LocalHTMLParser() |
298 | | - return di_parser |
299 | | - |
300 | | - # PDF |
301 | | - if ext_with_dot == ".pdf": |
302 | | - if use_local_pdf_parser or not di_parser: |
303 | | - return LocalPdfParser() |
304 | | - return di_parser |
305 | | - |
306 | | - # Formats requiring DI |
307 | | - di_required_exts = {".docx", ".pptx", ".xlsx", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"} |
308 | | - if ext_with_dot in di_required_exts: |
309 | | - if not di_parser: |
310 | | - raise ValueError("Document Intelligence service must be configured to process this file type") |
311 | | - return di_parser |
312 | | - |
313 | | - # Fallback: if MIME suggests application/* and DI available, use DI |
314 | | - if content_type.startswith("application/") and di_parser: |
315 | | - return di_parser |
316 | | - |
317 | | - raise ValueError(f"Unsupported file type: {file_name}") |
| 273 | + pdf_parser: Optional[Parser] = None |
| 274 | + if use_local_pdf_parser or document_intelligence_service is None: |
| 275 | + pdf_parser = LocalPdfParser() |
| 276 | + elif document_intelligence_service is not None: |
| 277 | + pdf_parser = doc_int_parser |
| 278 | + else: |
| 279 | + logger.warning("No PDF parser available") |
| 280 | + |
| 281 | + html_parser: Optional[Parser] = None |
| 282 | + if use_local_html_parser or document_intelligence_service is None: |
| 283 | + html_parser = LocalHTMLParser() |
| 284 | + elif document_intelligence_service is not None: |
| 285 | + html_parser = doc_int_parser |
| 286 | + else: |
| 287 | + logger.warning("No HTML parser available") |
| 288 | + |
| 289 | + # These file formats can always be parsed: |
| 290 | + file_processors = { |
| 291 | + ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), |
| 292 | + ".md": FileProcessor(TextParser(), sentence_text_splitter), |
| 293 | + ".txt": FileProcessor(TextParser(), sentence_text_splitter), |
| 294 | + ".csv": FileProcessor(CsvParser(), sentence_text_splitter), |
| 295 | + } |
| 296 | + # These require either a Python package or Document Intelligence |
| 297 | + if pdf_parser is not None: |
| 298 | + file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) |
| 299 | + if html_parser is not None: |
| 300 | + file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) |
| 301 | + # These file formats require Document Intelligence |
| 302 | + if doc_int_parser is not None: |
| 303 | + file_processors.update( |
| 304 | + { |
| 305 | + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 306 | + ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 307 | + ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 308 | + ".png": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 309 | + ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 310 | + ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 311 | + ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 312 | + ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 313 | + ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), |
| 314 | + } |
| 315 | + ) |
| 316 | + return file_processors |
| 317 | + |
| 318 | + |
| 319 | +def select_processor_for_filename(file_name: str, file_processors: dict[str, FileProcessor]) -> FileProcessor: |
| 320 | + """Select the appropriate file processor for a given filename. |
| 321 | +
|
| 322 | + Args: |
| 323 | + file_name: Name of the file to process |
| 324 | + file_processors: Dictionary mapping file extensions to FileProcessor instances |
| 325 | +
|
| 326 | + Returns: |
| 327 | + FileProcessor instance for the file |
| 328 | +
|
| 329 | + Raises: |
| 330 | + ValueError: If the file extension is not supported |
| 331 | + """ |
| 332 | + file_ext = os.path.splitext(file_name)[1].lower() |
| 333 | + file_processor = file_processors.get(file_ext) |
| 334 | + if not file_processor: |
| 335 | + raise ValueError(f"Unsupported file type: {file_name}") |
| 336 | + return file_processor |
0 commit comments