Skip to content

Commit a969d09

Browse files
authored
feat(parsers): added pptx, md, & html parsers (#1202)
* feat(parsers): added pptx, md, & html parsers
* ack PR comments
* file renaming, reorganization
1 parent df3d532 commit a969d09

File tree

39 files changed

+897
-527
lines changed

39 files changed

+897
-527
lines changed

apps/sim/app/api/copilot/chat/route.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ import {
1212
import { getCopilotModel } from '@/lib/copilot/config'
1313
import type { CopilotProviderConfig } from '@/lib/copilot/types'
1414
import { env } from '@/lib/env'
15-
import { generateChatTitle } from '@/lib/generate-chat-title'
1615
import { createLogger } from '@/lib/logs/console/logger'
1716
import { SIM_AGENT_API_URL_DEFAULT } from '@/lib/sim-agent'
17+
import { generateChatTitle } from '@/lib/sim-agent/utils'
1818
import { createFileContent, isSupportedFileType } from '@/lib/uploads/file-utils'
1919
import { S3_COPILOT_CONFIG } from '@/lib/uploads/setup'
2020
import { downloadFile, getStorageProvider } from '@/lib/uploads/storage-client'

apps/sim/app/api/files/parse/route.ts

Lines changed: 16 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,9 @@ export async function POST(request: NextRequest) {
7676

7777
logger.info('File parse request received:', { filePath, fileType })
7878

79-
// Handle multiple files
8079
if (Array.isArray(filePath)) {
8180
const results = []
8281
for (const path of filePath) {
83-
// Skip empty or invalid paths
8482
if (!path || (typeof path === 'string' && path.trim() === '')) {
8583
results.push({
8684
success: false,
@@ -91,12 +89,10 @@ export async function POST(request: NextRequest) {
9189
}
9290

9391
const result = await parseFileSingle(path, fileType)
94-
// Add processing time to metadata
9592
if (result.metadata) {
9693
result.metadata.processingTime = Date.now() - startTime
9794
}
9895

99-
// Transform each result to match expected frontend format
10096
if (result.success) {
10197
results.push({
10298
success: true,
@@ -105,7 +101,7 @@ export async function POST(request: NextRequest) {
105101
name: result.filePath.split('/').pop() || 'unknown',
106102
fileType: result.metadata?.fileType || 'application/octet-stream',
107103
size: result.metadata?.size || 0,
108-
binary: false, // We only return text content
104+
binary: false,
109105
},
110106
filePath: result.filePath,
111107
})
@@ -120,15 +116,12 @@ export async function POST(request: NextRequest) {
120116
})
121117
}
122118

123-
// Handle single file
124119
const result = await parseFileSingle(filePath, fileType)
125120

126-
// Add processing time to metadata
127121
if (result.metadata) {
128122
result.metadata.processingTime = Date.now() - startTime
129123
}
130124

131-
// Transform single file result to match expected frontend format
132125
if (result.success) {
133126
return NextResponse.json({
134127
success: true,
@@ -142,8 +135,6 @@ export async function POST(request: NextRequest) {
142135
})
143136
}
144137

145-
// Only return 500 for actual server errors, not file processing failures
146-
// File processing failures (like file not found, parsing errors) should return 200 with success:false
147138
return NextResponse.json(result)
148139
} catch (error) {
149140
logger.error('Error in file parse API:', error)
@@ -164,7 +155,6 @@ export async function POST(request: NextRequest) {
164155
async function parseFileSingle(filePath: string, fileType?: string): Promise<ParseResult> {
165156
logger.info('Parsing file:', filePath)
166157

167-
// Validate that filePath is not empty
168158
if (!filePath || filePath.trim() === '') {
169159
return {
170160
success: false,
@@ -173,7 +163,6 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
173163
}
174164
}
175165

176-
// Validate path for security before any processing
177166
const pathValidation = validateFilePath(filePath)
178167
if (!pathValidation.isValid) {
179168
return {
@@ -183,49 +172,40 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
183172
}
184173
}
185174

186-
// Check if this is an external URL
187175
if (filePath.startsWith('http://') || filePath.startsWith('https://')) {
188176
return handleExternalUrl(filePath, fileType)
189177
}
190178

191-
// Check if this is a cloud storage path (S3 or Blob)
192179
const isS3Path = filePath.includes('/api/files/serve/s3/')
193180
const isBlobPath = filePath.includes('/api/files/serve/blob/')
194181

195-
// Use cloud handler if it's a cloud path or we're in cloud mode
196182
if (isS3Path || isBlobPath || isUsingCloudStorage()) {
197183
return handleCloudFile(filePath, fileType)
198184
}
199185

200-
// Use local handler for local files
201186
return handleLocalFile(filePath, fileType)
202187
}
203188

204189
/**
205-
* Validate file path for security
190+
* Validate file path for security - prevents null byte injection and path traversal attacks
206191
*/
207192
function validateFilePath(filePath: string): { isValid: boolean; error?: string } {
208-
// Check for null bytes
209193
if (filePath.includes('\0')) {
210194
return { isValid: false, error: 'Invalid path: null byte detected' }
211195
}
212196

213-
// Check for path traversal attempts
214197
if (filePath.includes('..')) {
215198
return { isValid: false, error: 'Access denied: path traversal detected' }
216199
}
217200

218-
// Check for tilde characters (home directory access)
219201
if (filePath.includes('~')) {
220202
return { isValid: false, error: 'Invalid path: tilde character not allowed' }
221203
}
222204

223-
// Check for absolute paths outside allowed directories
224205
if (filePath.startsWith('/') && !filePath.startsWith('/api/files/serve/')) {
225206
return { isValid: false, error: 'Path outside allowed directory' }
226207
}
227208

228-
// Check for Windows absolute paths
229209
if (/^[A-Za-z]:\\/.test(filePath)) {
230210
return { isValid: false, error: 'Path outside allowed directory' }
231211
}
@@ -260,12 +240,10 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
260240

261241
logger.info(`Downloaded file from URL: ${url}, size: ${buffer.length} bytes`)
262242

263-
// Extract filename from URL
264243
const urlPath = new URL(url).pathname
265244
const filename = urlPath.split('/').pop() || 'download'
266245
const extension = path.extname(filename).toLowerCase().substring(1)
267246

268-
// Process the file based on its content type
269247
if (extension === 'pdf') {
270248
return await handlePdfBuffer(buffer, filename, fileType, url)
271249
}
@@ -276,7 +254,6 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
276254
return await handleGenericTextBuffer(buffer, filename, extension, fileType, url)
277255
}
278256

279-
// For binary or unknown files
280257
return handleGenericBuffer(buffer, filename, extension, fileType)
281258
} catch (error) {
282259
logger.error(`Error handling external URL ${url}:`, error)
@@ -289,58 +266,49 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
289266
}
290267

291268
/**
292-
* Handle file stored in cloud storage (S3 or Azure Blob)
269+
* Handle file stored in cloud storage
293270
*/
294271
async function handleCloudFile(filePath: string, fileType?: string): Promise<ParseResult> {
295272
try {
296-
// Extract the cloud key from the path
297273
let cloudKey: string
298274
if (filePath.includes('/api/files/serve/s3/')) {
299275
cloudKey = decodeURIComponent(filePath.split('/api/files/serve/s3/')[1])
300276
} else if (filePath.includes('/api/files/serve/blob/')) {
301277
cloudKey = decodeURIComponent(filePath.split('/api/files/serve/blob/')[1])
302278
} else if (filePath.startsWith('/api/files/serve/')) {
303-
// Backwards-compatibility: path like "/api/files/serve/<key>"
304279
cloudKey = decodeURIComponent(filePath.substring('/api/files/serve/'.length))
305280
} else {
306-
// Assume raw key provided
307281
cloudKey = filePath
308282
}
309283

310284
logger.info('Extracted cloud key:', cloudKey)
311285

312-
// Download the file from cloud storage - this can throw for access errors
313286
const fileBuffer = await downloadFile(cloudKey)
314287
logger.info(`Downloaded file from cloud storage: ${cloudKey}, size: ${fileBuffer.length} bytes`)
315288

316-
// Extract the filename from the cloud key
317289
const filename = cloudKey.split('/').pop() || cloudKey
318290
const extension = path.extname(filename).toLowerCase().substring(1)
319291

320-
// Process the file based on its content type
321292
if (extension === 'pdf') {
322293
return await handlePdfBuffer(fileBuffer, filename, fileType, filePath)
323294
}
324295
if (extension === 'csv') {
325296
return await handleCsvBuffer(fileBuffer, filename, fileType, filePath)
326297
}
327298
if (isSupportedFileType(extension)) {
328-
// For other supported types that we have parsers for
329299
return await handleGenericTextBuffer(fileBuffer, filename, extension, fileType, filePath)
330300
}
331-
// For binary or unknown files
332301
return handleGenericBuffer(fileBuffer, filename, extension, fileType)
333302
} catch (error) {
334303
logger.error(`Error handling cloud file ${filePath}:`, error)
335304

336-
// Check if this is a download/access error that should trigger a 500 response
305+
// For download/access errors, throw to trigger 500 response
337306
const errorMessage = (error as Error).message
338307
if (errorMessage.includes('Access denied') || errorMessage.includes('Forbidden')) {
339-
// For access errors, throw to trigger 500 response
340308
throw new Error(`Error accessing file from cloud storage: ${errorMessage}`)
341309
}
342310

343-
// For other errors (parsing, processing), return success:false
311+
// For other errors (parsing, processing), return success:false and an error message
344312
return {
345313
success: false,
346314
error: `Error accessing file from cloud storage: ${errorMessage}`,
@@ -354,28 +322,23 @@ async function handleCloudFile(filePath: string, fileType?: string): Promise<Par
354322
*/
355323
async function handleLocalFile(filePath: string, fileType?: string): Promise<ParseResult> {
356324
try {
357-
// Extract filename from path
358325
const filename = filePath.split('/').pop() || filePath
359326
const fullPath = path.join(UPLOAD_DIR_SERVER, filename)
360327

361328
logger.info('Processing local file:', fullPath)
362329

363-
// Check if file exists
364330
try {
365331
await fsPromises.access(fullPath)
366332
} catch {
367333
throw new Error(`File not found: ${filename}`)
368334
}
369335

370-
// Parse the file directly
371336
const result = await parseFile(fullPath)
372337

373-
// Get file stats for metadata
374338
const stats = await fsPromises.stat(fullPath)
375339
const fileBuffer = await readFile(fullPath)
376340
const hash = createHash('md5').update(fileBuffer).digest('hex')
377341

378-
// Extract file extension for type detection
379342
const extension = path.extname(filename).toLowerCase().substring(1)
380343

381344
return {
@@ -386,7 +349,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
386349
fileType: fileType || getMimeType(extension),
387350
size: stats.size,
388351
hash,
389-
processingTime: 0, // Will be set by caller
352+
processingTime: 0,
390353
},
391354
}
392355
} catch (error) {
@@ -425,15 +388,14 @@ async function handlePdfBuffer(
425388
fileType: fileType || 'application/pdf',
426389
size: fileBuffer.length,
427390
hash: createHash('md5').update(fileBuffer).digest('hex'),
428-
processingTime: 0, // Will be set by caller
391+
processingTime: 0,
429392
},
430393
}
431394
} catch (error) {
432395
logger.error('Failed to parse PDF in memory:', error)
433396

434-
// Create fallback message for PDF parsing failure
435397
const content = createPdfFailureMessage(
436-
0, // We can't determine page count without parsing
398+
0,
437399
fileBuffer.length,
438400
originalPath || filename,
439401
(error as Error).message
@@ -447,7 +409,7 @@ async function handlePdfBuffer(
447409
fileType: fileType || 'application/pdf',
448410
size: fileBuffer.length,
449411
hash: createHash('md5').update(fileBuffer).digest('hex'),
450-
processingTime: 0, // Will be set by caller
412+
processingTime: 0,
451413
},
452414
}
453415
}
@@ -465,7 +427,6 @@ async function handleCsvBuffer(
465427
try {
466428
logger.info(`Parsing CSV in memory: ${filename}`)
467429

468-
// Use the parseBuffer function from our library
469430
const { parseBuffer } = await import('@/lib/file-parsers')
470431
const result = await parseBuffer(fileBuffer, 'csv')
471432

@@ -477,7 +438,7 @@ async function handleCsvBuffer(
477438
fileType: fileType || 'text/csv',
478439
size: fileBuffer.length,
479440
hash: createHash('md5').update(fileBuffer).digest('hex'),
480-
processingTime: 0, // Will be set by caller
441+
processingTime: 0,
481442
},
482443
}
483444
} catch (error) {
@@ -490,7 +451,7 @@ async function handleCsvBuffer(
490451
fileType: 'text/csv',
491452
size: 0,
492453
hash: '',
493-
processingTime: 0, // Will be set by caller
454+
processingTime: 0,
494455
},
495456
}
496457
}
@@ -509,7 +470,6 @@ async function handleGenericTextBuffer(
509470
try {
510471
logger.info(`Parsing text file in memory: ${filename}`)
511472

512-
// Try to use a specialized parser if available
513473
try {
514474
const { parseBuffer, isSupportedFileType } = await import('@/lib/file-parsers')
515475

@@ -524,15 +484,14 @@ async function handleGenericTextBuffer(
524484
fileType: fileType || getMimeType(extension),
525485
size: fileBuffer.length,
526486
hash: createHash('md5').update(fileBuffer).digest('hex'),
527-
processingTime: 0, // Will be set by caller
487+
processingTime: 0,
528488
},
529489
}
530490
}
531491
} catch (parserError) {
532492
logger.warn('Specialized parser failed, falling back to generic parsing:', parserError)
533493
}
534494

535-
// Fallback to generic text parsing
536495
const content = fileBuffer.toString('utf-8')
537496

538497
return {
@@ -543,7 +502,7 @@ async function handleGenericTextBuffer(
543502
fileType: fileType || getMimeType(extension),
544503
size: fileBuffer.length,
545504
hash: createHash('md5').update(fileBuffer).digest('hex'),
546-
processingTime: 0, // Will be set by caller
505+
processingTime: 0,
547506
},
548507
}
549508
} catch (error) {
@@ -556,7 +515,7 @@ async function handleGenericTextBuffer(
556515
fileType: 'text/plain',
557516
size: 0,
558517
hash: '',
559-
processingTime: 0, // Will be set by caller
518+
processingTime: 0,
560519
},
561520
}
562521
}
@@ -584,7 +543,7 @@ function handleGenericBuffer(
584543
fileType: fileType || getMimeType(extension),
585544
size: fileBuffer.length,
586545
hash: createHash('md5').update(fileBuffer).digest('hex'),
587-
processingTime: 0, // Will be set by caller
546+
processingTime: 0,
588547
},
589548
}
590549
}
@@ -594,8 +553,6 @@ function handleGenericBuffer(
594553
*/
595554
async function parseBufferAsPdf(buffer: Buffer) {
596555
try {
597-
// Import parsers dynamically to avoid initialization issues in tests
598-
// First try to use the main PDF parser
599556
try {
600557
const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
601558
const parser = new PdfParser()
@@ -606,7 +563,6 @@ async function parseBufferAsPdf(buffer: Buffer) {
606563
}
607564
throw new Error('PDF parser does not support buffer parsing')
608565
} catch (error) {
609-
// Fallback to raw PDF parser
610566
logger.warn('Main PDF parser failed, using raw parser for buffer:', error)
611567
const { RawPdfParser } = await import('@/lib/file-parsers/raw-pdf-parser')
612568
const rawParser = new RawPdfParser()
@@ -655,7 +611,7 @@ Please use a PDF viewer for best results.`
655611
}
656612

657613
/**
658-
* Create error message for PDF parsing failure
614+
* Create error message for PDF parsing failure and make it more readable
659615
*/
660616
function createPdfFailureMessage(
661617
pageCount: number,

apps/sim/app/api/knowledge/[id]/documents/[documentId]/tag-definitions/route.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { randomUUID } from 'crypto'
22
import { type NextRequest, NextResponse } from 'next/server'
33
import { z } from 'zod'
44
import { getSession } from '@/lib/auth'
5-
import { SUPPORTED_FIELD_TYPES } from '@/lib/constants/knowledge'
5+
import { SUPPORTED_FIELD_TYPES } from '@/lib/knowledge/consts'
66
import {
77
cleanupUnusedTagDefinitions,
88
createOrUpdateTagDefinitionsBulk,

0 commit comments

Comments
 (0)