Skip to content

Commit 3bde9e8

Browse files
authored
fix(mistral): remove wrapped output from mistral parse for kb parsing pdfs (#2326)
1 parent 31b795f commit 3bde9e8

File tree

1 file changed

+22
-29
lines changed

1 file changed

+22
-29
lines changed

apps/sim/tools/mistral/parser.ts

Lines changed: 22 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,11 @@ export const mistralParserTool: ToolConfig<MistralParserInput, MistralParserOutp
258258
throw new Error('Invalid response format from Mistral OCR API')
259259
}
260260

261-
// Set default values and extract from params if available
261+
const mistralData =
262+
ocrResult.output && typeof ocrResult.output === 'object' && !ocrResult.pages
263+
? ocrResult.output
264+
: ocrResult
265+
262266
let resultType: 'markdown' | 'text' | 'json' = 'markdown'
263267
let sourceUrl = ''
264268
let isFileUpload = false
@@ -268,50 +272,44 @@ export const mistralParserTool: ToolConfig<MistralParserInput, MistralParserOutp
268272
sourceUrl = params.filePath.trim()
269273
}
270274

271-
// Check if this was a file upload
272275
isFileUpload = !!params.fileUpload
273276

274277
if (params.resultType && ['markdown', 'text', 'json'].includes(params.resultType)) {
275278
resultType = params.resultType as 'markdown' | 'text' | 'json'
276279
}
277280
} else if (
278-
ocrResult.document &&
279-
typeof ocrResult.document === 'object' &&
280-
ocrResult.document.document_url &&
281-
typeof ocrResult.document.document_url === 'string'
281+
mistralData.document &&
282+
typeof mistralData.document === 'object' &&
283+
mistralData.document.document_url &&
284+
typeof mistralData.document.document_url === 'string'
282285
) {
283-
sourceUrl = ocrResult.document.document_url
286+
sourceUrl = mistralData.document.document_url
284287
}
285288

286-
// Process content from pages
287289
let content = ''
288290
const pageCount =
289-
ocrResult.pages && Array.isArray(ocrResult.pages) ? ocrResult.pages.length : 0
291+
mistralData.pages && Array.isArray(mistralData.pages) ? mistralData.pages.length : 0
290292

291293
if (pageCount > 0) {
292-
content = ocrResult.pages
294+
content = mistralData.pages
293295
.map((page: any) => (page && typeof page.markdown === 'string' ? page.markdown : ''))
294296
.filter(Boolean)
295297
.join('\n\n')
296298
} else {
297299
logger.warn('No pages found in OCR result, returning raw response')
298-
content = JSON.stringify(ocrResult, null, 2)
300+
content = JSON.stringify(mistralData, null, 2)
299301
}
300302

301-
// Process based on requested result type
302303
if (resultType === 'text') {
303-
// Strip markdown formatting
304304
content = content
305305
.replace(/##*\s/g, '') // Remove markdown headers
306306
.replace(/\*\*/g, '') // Remove bold markers
307307
.replace(/\*/g, '') // Remove italic markers
308308
.replace(/\n{3,}/g, '\n\n') // Normalize newlines
309309
} else if (resultType === 'json') {
310-
// Return the structured data as JSON string
311-
content = JSON.stringify(ocrResult, null, 2)
310+
content = JSON.stringify(mistralData, null, 2)
312311
}
313312

314-
// Extract file information with proper validation
315313
let fileName = 'document.pdf'
316314
let fileType = 'pdf'
317315

@@ -333,40 +331,36 @@ export const mistralParserTool: ToolConfig<MistralParserInput, MistralParserOutp
333331
}
334332
}
335333

336-
// Generate a tracking ID with timestamp and random component for uniqueness
337334
const timestamp = Date.now()
338335
const randomId = Math.random().toString(36).substring(2, 10)
339336
const jobId = `mistral-ocr-${timestamp}-${randomId}`
340337

341-
// Map API response fields to our schema with proper type checking
342338
const usageInfo =
343-
ocrResult.usage_info && typeof ocrResult.usage_info === 'object'
339+
mistralData.usage_info && typeof mistralData.usage_info === 'object'
344340
? {
345341
pagesProcessed:
346-
typeof ocrResult.usage_info.pages_processed === 'number'
347-
? ocrResult.usage_info.pages_processed
348-
: Number(ocrResult.usage_info.pages_processed),
342+
typeof mistralData.usage_info.pages_processed === 'number'
343+
? mistralData.usage_info.pages_processed
344+
: Number(mistralData.usage_info.pages_processed),
349345
docSizeBytes:
350-
typeof ocrResult.usage_info.doc_size_bytes === 'number'
351-
? ocrResult.usage_info.doc_size_bytes
352-
: Number(ocrResult.usage_info.doc_size_bytes),
346+
typeof mistralData.usage_info.doc_size_bytes === 'number'
347+
? mistralData.usage_info.doc_size_bytes
348+
: Number(mistralData.usage_info.doc_size_bytes),
353349
}
354350
: undefined
355351

356-
// Create metadata object
357352
const metadata: any = {
358353
jobId,
359354
fileType,
360355
fileName,
361356
source: 'url',
362357
pageCount,
363358
usageInfo,
364-
model: typeof ocrResult.model === 'string' ? ocrResult.model : 'mistral-ocr-latest',
359+
model: typeof mistralData.model === 'string' ? mistralData.model : 'mistral-ocr-latest',
365360
resultType,
366361
processedAt: new Date().toISOString(),
367362
}
368363

369-
// Only include sourceUrl for non-file-upload sources or URLs that don't contain our API endpoint
370364
if (
371365
!isFileUpload &&
372366
sourceUrl &&
@@ -376,7 +370,6 @@ export const mistralParserTool: ToolConfig<MistralParserInput, MistralParserOutp
376370
metadata.sourceUrl = sourceUrl
377371
}
378372

379-
// Return properly structured response
380373
const parserResponse: MistralParserOutput = {
381374
success: true,
382375
output: {

0 commit comments

Comments
 (0)