Skip to content

Commit c997912

Browse files
committed
ack PR comments
1 parent b54036c commit c997912

File tree

1 file changed

+11
-18
lines changed

1 file changed

+11
-18
lines changed

apps/sim/lib/knowledge/documents/document-processor.ts

Lines changed: 11 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ async function parseDocument(
225225
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
226226
if (hasAzureMistralOCR) {
227227
logger.info(`Using Azure Mistral OCR: ${filename}`)
228-
return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId)
228+
return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
229229
}
230230

231231
if (hasMistralOCR) {
@@ -399,13 +399,7 @@ async function makeOCRRequest(
399399
}
400400
}
401401

402-
async function parseWithAzureMistralOCR(
403-
fileUrl: string,
404-
filename: string,
405-
mimeType: string,
406-
userId?: string,
407-
workspaceId?: string | null
408-
) {
402+
async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
409403
validateOCRConfig(
410404
env.OCR_AZURE_API_KEY,
411405
env.OCR_AZURE_ENDPOINT,
@@ -509,7 +503,7 @@ async function parseWithMistralOCR(
509503
logger.info(
510504
`PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.`
511505
)
512-
return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl, fileUrl, mimeType)
506+
return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl)
513507
}
514508

515509
const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const }
@@ -662,12 +656,10 @@ async function processMistralOCRInBatches(
662656
apiKey: string,
663657
pdfBuffer: Buffer,
664658
userId?: string,
665-
cloudUrl?: string,
666-
fileUrl?: string,
667-
mimeType?: string
659+
cloudUrl?: string
668660
): Promise<{
669661
content: string
670-
processingMethod: 'mistral-ocr' | 'file-parser'
662+
processingMethod: 'mistral-ocr'
671663
cloudUrl?: string
672664
}> {
673665
const totalPages = await getPdfPageCount(pdfBuffer)
@@ -706,11 +698,12 @@ async function processMistralOCRInBatches(
706698
.map((r) => r.content as string)
707699

708700
if (sortedResults.length === 0) {
709-
logger.error(`All OCR chunks failed for ${filename}, falling back to file parser`)
710-
if (fileUrl && mimeType) {
711-
return parseWithFileParser(fileUrl, filename, mimeType)
712-
}
713-
throw new Error('All OCR chunks failed and no fallback available')
701+
// Don't fall back to the file parser for large PDFs — it produces poor results.
702+
// It is better to fail clearly than to return a low-quality extraction.
703+
throw new Error(
704+
`OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
705+
`Large PDFs require OCR - file parser fallback would produce poor results.`
706+
)
714707
}
715708

716709
const combinedContent = sortedResults.join('\n\n')

0 commit comments

Comments
 (0)