@@ -225,7 +225,7 @@ async function parseDocument(
225225 if ( isPDF && ( hasAzureMistralOCR || hasMistralOCR ) ) {
226226 if ( hasAzureMistralOCR ) {
227227 logger . info ( `Using Azure Mistral OCR: ${ filename } ` )
228- return parseWithAzureMistralOCR ( fileUrl , filename , mimeType , userId , workspaceId )
228+ return parseWithAzureMistralOCR ( fileUrl , filename , mimeType )
229229 }
230230
231231 if ( hasMistralOCR ) {
@@ -399,13 +399,7 @@ async function makeOCRRequest(
399399 }
400400}
401401
402- async function parseWithAzureMistralOCR (
403- fileUrl : string ,
404- filename : string ,
405- mimeType : string ,
406- userId ?: string ,
407- workspaceId ?: string | null
408- ) {
402+ async function parseWithAzureMistralOCR ( fileUrl : string , filename : string , mimeType : string ) {
409403 validateOCRConfig (
410404 env . OCR_AZURE_API_KEY ,
411405 env . OCR_AZURE_ENDPOINT ,
@@ -509,7 +503,7 @@ async function parseWithMistralOCR(
509503 logger . info (
510504 `PDF has ${ pageCount } pages, exceeds limit of ${ MISTRAL_MAX_PAGES } . Splitting and processing in chunks.`
511505 )
512- return processMistralOCRInBatches ( filename , apiKey , buffer , userId , cloudUrl , fileUrl , mimeType )
506+ return processMistralOCRInBatches ( filename , apiKey , buffer , userId , cloudUrl )
513507 }
514508
515509 const params = { filePath : httpsUrl , apiKey, resultType : 'text' as const }
@@ -662,12 +656,10 @@ async function processMistralOCRInBatches(
662656 apiKey : string ,
663657 pdfBuffer : Buffer ,
664658 userId ?: string ,
665- cloudUrl ?: string ,
666- fileUrl ?: string ,
667- mimeType ?: string
659+ cloudUrl ?: string
668660) : Promise < {
669661 content : string
670- processingMethod : 'mistral-ocr' | 'file-parser'
662+ processingMethod : 'mistral-ocr'
671663 cloudUrl ?: string
672664} > {
673665 const totalPages = await getPdfPageCount ( pdfBuffer )
@@ -706,11 +698,12 @@ async function processMistralOCRInBatches(
706698 . map ( ( r ) => r . content as string )
707699
708700 if ( sortedResults . length === 0 ) {
709- logger . error ( `All OCR chunks failed for ${ filename } , falling back to file parser` )
710- if ( fileUrl && mimeType ) {
711- return parseWithFileParser ( fileUrl , filename , mimeType )
712- }
713- throw new Error ( 'All OCR chunks failed and no fallback available' )
701+ // Don't fall back to file parser for large PDFs - it produces poor results
702+ // Better to fail clearly than return low-quality extraction
703+ throw new Error (
704+ `OCR failed for all ${ pdfChunks . length } chunks of ${ filename } . ` +
705+ `Large PDFs require OCR - file parser fallback would produce poor results.`
706+ )
714707 }
715708
716709 const combinedContent = sortedResults . join ( '\n\n' )
0 commit comments