Skip to content

Commit b54036c

Browse files
committed
comments cleanup
1 parent 307d7ab commit b54036c

File tree

1 file changed

+5
-14
lines changed

1 file changed

+5
-14
lines changed

apps/sim/lib/knowledge/documents/document-processor.ts

Lines changed: 5 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@ const TIMEOUTS = {
1717
MISTRAL_OCR_API: 120000,
1818
} as const
1919

20+
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY
21+
2022
type OCRResult = {
2123
success: boolean
2224
error?: string
@@ -245,10 +247,6 @@ async function handleFileForOCR(
245247
) {
246248
const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')
247249

248-
logger.info(
249-
`handleFileForOCR: fileUrl=${fileUrl.substring(0, 100)}..., isExternalHttps=${isExternalHttps}`
250-
)
251-
252250
if (isExternalHttps) {
253251
if (mimeType === 'application/pdf') {
254252
logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
@@ -417,16 +415,14 @@ async function parseWithAzureMistralOCR(
417415

418416
const fileBuffer = await downloadFileForBase64(fileUrl)
419417

420-
// Check page count for PDFs - Azure OCR with base64 doesn't support efficient batching
421-
// so we skip it for large files and let regular Mistral OCR handle them
422418
if (mimeType === 'application/pdf') {
423419
const pageCount = await getPdfPageCount(fileBuffer)
424420
if (pageCount > MISTRAL_MAX_PAGES) {
425421
logger.info(
426-
`PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. ` +
427-
`Skipping Azure OCR (base64 doesn't batch efficiently), will use regular Mistral OCR.`
422+
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
423+
`Falling back to file parser.`
428424
)
429-
throw new Error(`PDF too large for Azure OCR: ${pageCount} pages`)
425+
return parseWithFileParser(fileUrl, filename, mimeType)
430426
}
431427
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
432428
}
@@ -501,7 +497,6 @@ async function parseWithMistralOCR(
501497

502498
logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`)
503499

504-
// Check page count for PDFs and batch if necessary
505500
let pageCount = 0
506501
if (mimeType === 'application/pdf' && buffer) {
507502
pageCount = await getPdfPageCount(buffer)
@@ -517,7 +512,6 @@ async function parseWithMistralOCR(
517512
return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl, fileUrl, mimeType)
518513
}
519514

520-
// Single request for smaller PDFs
521515
const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const }
522516

523517
try {
@@ -663,9 +657,6 @@ async function processChunk(
663657
}
664658
}
665659

666-
// Maximum concurrent chunk processing to avoid overwhelming APIs
667-
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY
668-
669660
async function processMistralOCRInBatches(
670661
filename: string,
671662
apiKey: string,

0 commit comments

Comments
 (0)