@@ -76,11 +76,9 @@ export async function POST(request: NextRequest) {
7676
7777 logger . info ( 'File parse request received:' , { filePath, fileType } )
7878
79- // Handle multiple files
8079 if ( Array . isArray ( filePath ) ) {
8180 const results = [ ]
8281 for ( const path of filePath ) {
83- // Skip empty or invalid paths
8482 if ( ! path || ( typeof path === 'string' && path . trim ( ) === '' ) ) {
8583 results . push ( {
8684 success : false ,
@@ -91,12 +89,10 @@ export async function POST(request: NextRequest) {
9189 }
9290
9391 const result = await parseFileSingle ( path , fileType )
94- // Add processing time to metadata
9592 if ( result . metadata ) {
9693 result . metadata . processingTime = Date . now ( ) - startTime
9794 }
9895
99- // Transform each result to match expected frontend format
10096 if ( result . success ) {
10197 results . push ( {
10298 success : true ,
@@ -105,7 +101,7 @@ export async function POST(request: NextRequest) {
105101 name : result . filePath . split ( '/' ) . pop ( ) || 'unknown' ,
106102 fileType : result . metadata ?. fileType || 'application/octet-stream' ,
107103 size : result . metadata ?. size || 0 ,
108- binary : false , // We only return text content
104+ binary : false ,
109105 } ,
110106 filePath : result . filePath ,
111107 } )
@@ -120,15 +116,12 @@ export async function POST(request: NextRequest) {
120116 } )
121117 }
122118
123- // Handle single file
124119 const result = await parseFileSingle ( filePath , fileType )
125120
126- // Add processing time to metadata
127121 if ( result . metadata ) {
128122 result . metadata . processingTime = Date . now ( ) - startTime
129123 }
130124
131- // Transform single file result to match expected frontend format
132125 if ( result . success ) {
133126 return NextResponse . json ( {
134127 success : true ,
@@ -142,8 +135,6 @@ export async function POST(request: NextRequest) {
142135 } )
143136 }
144137
145- // Only return 500 for actual server errors, not file processing failures
146- // File processing failures (like file not found, parsing errors) should return 200 with success:false
147138 return NextResponse . json ( result )
148139 } catch ( error ) {
149140 logger . error ( 'Error in file parse API:' , error )
@@ -164,7 +155,6 @@ export async function POST(request: NextRequest) {
164155async function parseFileSingle ( filePath : string , fileType ?: string ) : Promise < ParseResult > {
165156 logger . info ( 'Parsing file:' , filePath )
166157
167- // Validate that filePath is not empty
168158 if ( ! filePath || filePath . trim ( ) === '' ) {
169159 return {
170160 success : false ,
@@ -173,7 +163,6 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
173163 }
174164 }
175165
176- // Validate path for security before any processing
177166 const pathValidation = validateFilePath ( filePath )
178167 if ( ! pathValidation . isValid ) {
179168 return {
@@ -183,49 +172,40 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
183172 }
184173 }
185174
186- // Check if this is an external URL
187175 if ( filePath . startsWith ( 'http://' ) || filePath . startsWith ( 'https://' ) ) {
188176 return handleExternalUrl ( filePath , fileType )
189177 }
190178
191- // Check if this is a cloud storage path (S3 or Blob)
192179 const isS3Path = filePath . includes ( '/api/files/serve/s3/' )
193180 const isBlobPath = filePath . includes ( '/api/files/serve/blob/' )
194181
195- // Use cloud handler if it's a cloud path or we're in cloud mode
196182 if ( isS3Path || isBlobPath || isUsingCloudStorage ( ) ) {
197183 return handleCloudFile ( filePath , fileType )
198184 }
199185
200- // Use local handler for local files
201186 return handleLocalFile ( filePath , fileType )
202187}
203188
204189/**
205- * Validate file path for security
190+ * Validate file path for security - prevents null byte injection and path traversal attacks
206191 */
207192function validateFilePath ( filePath : string ) : { isValid : boolean ; error ?: string } {
208- // Check for null bytes
209193 if ( filePath . includes ( '\0' ) ) {
210194 return { isValid : false , error : 'Invalid path: null byte detected' }
211195 }
212196
213- // Check for path traversal attempts
214197 if ( filePath . includes ( '..' ) ) {
215198 return { isValid : false , error : 'Access denied: path traversal detected' }
216199 }
217200
218- // Check for tilde characters (home directory access)
219201 if ( filePath . includes ( '~' ) ) {
220202 return { isValid : false , error : 'Invalid path: tilde character not allowed' }
221203 }
222204
223- // Check for absolute paths outside allowed directories
224205 if ( filePath . startsWith ( '/' ) && ! filePath . startsWith ( '/api/files/serve/' ) ) {
225206 return { isValid : false , error : 'Path outside allowed directory' }
226207 }
227208
228- // Check for Windows absolute paths
229209 if ( / ^ [ A - Z a - z ] : \\ / . test ( filePath ) ) {
230210 return { isValid : false , error : 'Path outside allowed directory' }
231211 }
@@ -260,12 +240,10 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
260240
261241 logger . info ( `Downloaded file from URL: ${ url } , size: ${ buffer . length } bytes` )
262242
263- // Extract filename from URL
264243 const urlPath = new URL ( url ) . pathname
265244 const filename = urlPath . split ( '/' ) . pop ( ) || 'download'
266245 const extension = path . extname ( filename ) . toLowerCase ( ) . substring ( 1 )
267246
268- // Process the file based on its content type
269247 if ( extension === 'pdf' ) {
270248 return await handlePdfBuffer ( buffer , filename , fileType , url )
271249 }
@@ -276,7 +254,6 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
276254 return await handleGenericTextBuffer ( buffer , filename , extension , fileType , url )
277255 }
278256
279- // For binary or unknown files
280257 return handleGenericBuffer ( buffer , filename , extension , fileType )
281258 } catch ( error ) {
282259 logger . error ( `Error handling external URL ${ url } :` , error )
@@ -289,58 +266,49 @@ async function handleExternalUrl(url: string, fileType?: string): Promise<ParseR
289266}
290267
291268/**
292- * Handle file stored in cloud storage (S3 or Azure Blob)
269+ * Handle file stored in cloud storage
293270 */
294271async function handleCloudFile ( filePath : string , fileType ?: string ) : Promise < ParseResult > {
295272 try {
296- // Extract the cloud key from the path
297273 let cloudKey : string
298274 if ( filePath . includes ( '/api/files/serve/s3/' ) ) {
299275 cloudKey = decodeURIComponent ( filePath . split ( '/api/files/serve/s3/' ) [ 1 ] )
300276 } else if ( filePath . includes ( '/api/files/serve/blob/' ) ) {
301277 cloudKey = decodeURIComponent ( filePath . split ( '/api/files/serve/blob/' ) [ 1 ] )
302278 } else if ( filePath . startsWith ( '/api/files/serve/' ) ) {
303- // Backwards-compatibility: path like "/api/files/serve/<key>"
304279 cloudKey = decodeURIComponent ( filePath . substring ( '/api/files/serve/' . length ) )
305280 } else {
306- // Assume raw key provided
307281 cloudKey = filePath
308282 }
309283
310284 logger . info ( 'Extracted cloud key:' , cloudKey )
311285
312- // Download the file from cloud storage - this can throw for access errors
313286 const fileBuffer = await downloadFile ( cloudKey )
314287 logger . info ( `Downloaded file from cloud storage: ${ cloudKey } , size: ${ fileBuffer . length } bytes` )
315288
316- // Extract the filename from the cloud key
317289 const filename = cloudKey . split ( '/' ) . pop ( ) || cloudKey
318290 const extension = path . extname ( filename ) . toLowerCase ( ) . substring ( 1 )
319291
320- // Process the file based on its content type
321292 if ( extension === 'pdf' ) {
322293 return await handlePdfBuffer ( fileBuffer , filename , fileType , filePath )
323294 }
324295 if ( extension === 'csv' ) {
325296 return await handleCsvBuffer ( fileBuffer , filename , fileType , filePath )
326297 }
327298 if ( isSupportedFileType ( extension ) ) {
328- // For other supported types that we have parsers for
329299 return await handleGenericTextBuffer ( fileBuffer , filename , extension , fileType , filePath )
330300 }
331- // For binary or unknown files
332301 return handleGenericBuffer ( fileBuffer , filename , extension , fileType )
333302 } catch ( error ) {
334303 logger . error ( `Error handling cloud file ${ filePath } :` , error )
335304
336- // Check if this is a download/access error that should trigger a 500 response
305+ // For access errors (message contains 'Access denied' or 'Forbidden'), throw to trigger 500 response
337306 const errorMessage = ( error as Error ) . message
338307 if ( errorMessage . includes ( 'Access denied' ) || errorMessage . includes ( 'Forbidden' ) ) {
339- // For access errors, throw to trigger 500 response
340308 throw new Error ( `Error accessing file from cloud storage: ${ errorMessage } ` )
341309 }
342310
343- // For other errors (parsing, processing), return success:false
311+ // For other errors (parsing, processing), return success:false and an error message
344312 return {
345313 success : false ,
346314 error : `Error accessing file from cloud storage: ${ errorMessage } ` ,
@@ -354,28 +322,23 @@ async function handleCloudFile(filePath: string, fileType?: string): Promise<Par
354322 */
355323async function handleLocalFile ( filePath : string , fileType ?: string ) : Promise < ParseResult > {
356324 try {
357- // Extract filename from path
358325 const filename = filePath . split ( '/' ) . pop ( ) || filePath
359326 const fullPath = path . join ( UPLOAD_DIR_SERVER , filename )
360327
361328 logger . info ( 'Processing local file:' , fullPath )
362329
363- // Check if file exists
364330 try {
365331 await fsPromises . access ( fullPath )
366332 } catch {
367333 throw new Error ( `File not found: ${ filename } ` )
368334 }
369335
370- // Parse the file directly
371336 const result = await parseFile ( fullPath )
372337
373- // Get file stats for metadata
374338 const stats = await fsPromises . stat ( fullPath )
375339 const fileBuffer = await readFile ( fullPath )
376340 const hash = createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' )
377341
378- // Extract file extension for type detection
379342 const extension = path . extname ( filename ) . toLowerCase ( ) . substring ( 1 )
380343
381344 return {
@@ -386,7 +349,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
386349 fileType : fileType || getMimeType ( extension ) ,
387350 size : stats . size ,
388351 hash,
389- processingTime : 0 , // Will be set by caller
352+ processingTime : 0 ,
390353 } ,
391354 }
392355 } catch ( error ) {
@@ -425,15 +388,14 @@ async function handlePdfBuffer(
425388 fileType : fileType || 'application/pdf' ,
426389 size : fileBuffer . length ,
427390 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
428- processingTime : 0 , // Will be set by caller
391+ processingTime : 0 ,
429392 } ,
430393 }
431394 } catch ( error ) {
432395 logger . error ( 'Failed to parse PDF in memory:' , error )
433396
434- // Create fallback message for PDF parsing failure
435397 const content = createPdfFailureMessage (
436- 0 , // We can't determine page count without parsing
398+ 0 ,
437399 fileBuffer . length ,
438400 originalPath || filename ,
439401 ( error as Error ) . message
@@ -447,7 +409,7 @@ async function handlePdfBuffer(
447409 fileType : fileType || 'application/pdf' ,
448410 size : fileBuffer . length ,
449411 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
450- processingTime : 0 , // Will be set by caller
412+ processingTime : 0 ,
451413 } ,
452414 }
453415 }
@@ -465,7 +427,6 @@ async function handleCsvBuffer(
465427 try {
466428 logger . info ( `Parsing CSV in memory: ${ filename } ` )
467429
468- // Use the parseBuffer function from our library
469430 const { parseBuffer } = await import ( '@/lib/file-parsers' )
470431 const result = await parseBuffer ( fileBuffer , 'csv' )
471432
@@ -477,7 +438,7 @@ async function handleCsvBuffer(
477438 fileType : fileType || 'text/csv' ,
478439 size : fileBuffer . length ,
479440 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
480- processingTime : 0 , // Will be set by caller
441+ processingTime : 0 ,
481442 } ,
482443 }
483444 } catch ( error ) {
@@ -490,7 +451,7 @@ async function handleCsvBuffer(
490451 fileType : 'text/csv' ,
491452 size : 0 ,
492453 hash : '' ,
493- processingTime : 0 , // Will be set by caller
454+ processingTime : 0 ,
494455 } ,
495456 }
496457 }
@@ -509,7 +470,6 @@ async function handleGenericTextBuffer(
509470 try {
510471 logger . info ( `Parsing text file in memory: ${ filename } ` )
511472
512- // Try to use a specialized parser if available
513473 try {
514474 const { parseBuffer, isSupportedFileType } = await import ( '@/lib/file-parsers' )
515475
@@ -524,15 +484,14 @@ async function handleGenericTextBuffer(
524484 fileType : fileType || getMimeType ( extension ) ,
525485 size : fileBuffer . length ,
526486 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
527- processingTime : 0 , // Will be set by caller
487+ processingTime : 0 ,
528488 } ,
529489 }
530490 }
531491 } catch ( parserError ) {
532492 logger . warn ( 'Specialized parser failed, falling back to generic parsing:' , parserError )
533493 }
534494
535- // Fallback to generic text parsing
536495 const content = fileBuffer . toString ( 'utf-8' )
537496
538497 return {
@@ -543,7 +502,7 @@ async function handleGenericTextBuffer(
543502 fileType : fileType || getMimeType ( extension ) ,
544503 size : fileBuffer . length ,
545504 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
546- processingTime : 0 , // Will be set by caller
505+ processingTime : 0 ,
547506 } ,
548507 }
549508 } catch ( error ) {
@@ -556,7 +515,7 @@ async function handleGenericTextBuffer(
556515 fileType : 'text/plain' ,
557516 size : 0 ,
558517 hash : '' ,
559- processingTime : 0 , // Will be set by caller
518+ processingTime : 0 ,
560519 } ,
561520 }
562521 }
@@ -584,7 +543,7 @@ function handleGenericBuffer(
584543 fileType : fileType || getMimeType ( extension ) ,
585544 size : fileBuffer . length ,
586545 hash : createHash ( 'md5' ) . update ( fileBuffer ) . digest ( 'hex' ) ,
587- processingTime : 0 , // Will be set by caller
546+ processingTime : 0 ,
588547 } ,
589548 }
590549}
@@ -594,8 +553,6 @@ function handleGenericBuffer(
594553 */
595554async function parseBufferAsPdf ( buffer : Buffer ) {
596555 try {
597- // Import parsers dynamically to avoid initialization issues in tests
598- // First try to use the main PDF parser
599556 try {
600557 const { PdfParser } = await import ( '@/lib/file-parsers/pdf-parser' )
601558 const parser = new PdfParser ( )
@@ -606,7 +563,6 @@ async function parseBufferAsPdf(buffer: Buffer) {
606563 }
607564 throw new Error ( 'PDF parser does not support buffer parsing' )
608565 } catch ( error ) {
609- // Fallback to raw PDF parser
610566 logger . warn ( 'Main PDF parser failed, using raw parser for buffer:' , error )
611567 const { RawPdfParser } = await import ( '@/lib/file-parsers/raw-pdf-parser' )
612568 const rawParser = new RawPdfParser ( )
@@ -655,7 +611,7 @@ Please use a PDF viewer for best results.`
655611}
656612
657613/**
658- * Create error message for PDF parsing failure
614+ * Create a readable error message for a PDF parsing failure
659615 */
660616function createPdfFailureMessage (
661617 pageCount : number ,
0 commit comments