@@ -17,6 +17,8 @@ const TIMEOUTS = {
1717 MISTRAL_OCR_API : 120000 ,
1818} as const
1919
20+ const MAX_CONCURRENT_CHUNKS = env . KB_CONFIG_CHUNK_CONCURRENCY
21+
2022type OCRResult = {
2123 success : boolean
2224 error ?: string
@@ -245,10 +247,6 @@ async function handleFileForOCR(
245247) {
246248 const isExternalHttps = fileUrl . startsWith ( 'https://' ) && ! fileUrl . includes ( '/api/files/serve/' )
247249
248- logger . info (
249- `handleFileForOCR: fileUrl=${ fileUrl . substring ( 0 , 100 ) } ..., isExternalHttps=${ isExternalHttps } `
250- )
251-
252250 if ( isExternalHttps ) {
253251 if ( mimeType === 'application/pdf' ) {
254252 logger . info ( `handleFileForOCR: Downloading external PDF to check page count` )
@@ -417,16 +415,14 @@ async function parseWithAzureMistralOCR(
417415
418416 const fileBuffer = await downloadFileForBase64 ( fileUrl )
419417
420- // Check page count for PDFs - Azure OCR with base64 doesn't support efficient batching
421- // so we skip it for large files and let regular Mistral OCR handle them
422418 if ( mimeType === 'application/pdf' ) {
423419 const pageCount = await getPdfPageCount ( fileBuffer )
424420 if ( pageCount > MISTRAL_MAX_PAGES ) {
425421 logger . info (
426- `PDF has ${ pageCount } pages, exceeds limit of ${ MISTRAL_MAX_PAGES } . ` +
427- `Skipping Azure OCR (base64 doesn't batch efficiently), will use regular Mistral OCR .`
422+ `PDF has ${ pageCount } pages, exceeds Azure OCR limit of ${ MISTRAL_MAX_PAGES } . ` +
423+ `Falling back to file parser .`
428424 )
429- throw new Error ( `PDF too large for Azure OCR: ${ pageCount } pages` )
425+ return parseWithFileParser ( fileUrl , filename , mimeType )
430426 }
431427 logger . info ( `Azure Mistral OCR: PDF page count for ${ filename } : ${ pageCount } ` )
432428 }
@@ -501,7 +497,6 @@ async function parseWithMistralOCR(
501497
502498 logger . info ( `Mistral OCR: Using presigned URL for ${ filename } : ${ httpsUrl . substring ( 0 , 120 ) } ...` )
503499
504- // Check page count for PDFs and batch if necessary
505500 let pageCount = 0
506501 if ( mimeType === 'application/pdf' && buffer ) {
507502 pageCount = await getPdfPageCount ( buffer )
@@ -517,7 +512,6 @@ async function parseWithMistralOCR(
517512 return processMistralOCRInBatches ( filename , apiKey , buffer , userId , cloudUrl , fileUrl , mimeType )
518513 }
519514
520- // Single request for smaller PDFs
521515 const params = { filePath : httpsUrl , apiKey, resultType : 'text' as const }
522516
523517 try {
@@ -663,9 +657,6 @@ async function processChunk(
663657 }
664658}
665659
666- // Maximum concurrent chunk processing to avoid overwhelming APIs
667- const MAX_CONCURRENT_CHUNKS = env . KB_CONFIG_CHUNK_CONCURRENCY
668-
669660async function processMistralOCRInBatches (
670661 filename : string ,
671662 apiKey : string ,
0 commit comments