@@ -4,6 +4,7 @@ import { getBYOKKey } from '@/lib/api-key/byok'
44import { type Chunk , JsonYamlChunker , StructuredDataChunker , TextChunker } from '@/lib/chunkers'
55import { env } from '@/lib/core/config/env'
66import { parseBuffer , parseFile } from '@/lib/file-parsers'
7+ import type { FileParseMetadata } from '@/lib/file-parsers/types'
78import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
89import { StorageService } from '@/lib/uploads'
910import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
@@ -35,7 +36,6 @@ type OCRRequestBody = {
3536 document_url : string
3637 }
3738 include_image_base64 : boolean
38- pages ?: number [ ]
3939}
4040
4141const MISTRAL_MAX_PAGES = 1000
@@ -138,7 +138,7 @@ export async function processDocument(
138138 const cloudUrl = 'cloudUrl' in parseResult ? parseResult . cloudUrl : undefined
139139
140140 let chunks : Chunk [ ]
141- const metadata = 'metadata' in parseResult ? parseResult . metadata : { }
141+ const metadata : FileParseMetadata = parseResult . metadata ?? { }
142142
143143 const isJsonYaml =
144144 metadata . type === 'json' ||
@@ -154,10 +154,11 @@ export async function processDocument(
154154 } )
155155 } else if ( StructuredDataChunker . isStructuredData ( content , mimeType ) ) {
156156 logger . info ( 'Using structured data chunker for spreadsheet/CSV content' )
157+ const rowCount = metadata . totalRows ?? metadata . rowCount
157158 chunks = await StructuredDataChunker . chunkStructuredData ( content , {
158159 chunkSize,
159160 headers : metadata . headers ,
160- totalRows : metadata . totalRows || metadata . rowCount ,
161+ totalRows : typeof rowCount === 'number' ? rowCount : undefined ,
161162 sheetName : metadata . sheetNames ?. [ 0 ] ,
162163 } )
163164 } else {
@@ -210,7 +211,7 @@ async function parseDocument(
210211 content : string
211212 processingMethod : 'file-parser' | 'mistral-ocr'
212213 cloudUrl ?: string
213- metadata ?: any
214+ metadata ?: FileParseMetadata
214215} > {
215216 const isPDF = mimeType === 'application/pdf'
216217 const hasAzureMistralOCR =
@@ -663,7 +664,7 @@ async function processChunk(
663664}
664665
665666// Maximum concurrent chunk processing to avoid overwhelming APIs
666- const MAX_CONCURRENT_CHUNKS = env . KB_CONFIG_CHUNK_CONCURRENCY || 5
667+ const MAX_CONCURRENT_CHUNKS = env . KB_CONFIG_CHUNK_CONCURRENCY
667668
668669async function processMistralOCRInBatches (
669670 filename : string ,
@@ -736,7 +737,7 @@ async function processMistralOCRInBatches(
736737async function parseWithFileParser ( fileUrl : string , filename : string , mimeType : string ) {
737738 try {
738739 let content : string
739- let metadata : any = { }
740+ let metadata : FileParseMetadata = { }
740741
741742 if ( fileUrl . startsWith ( 'data:' ) ) {
742743 content = await parseDataURI ( fileUrl , filename , mimeType )
@@ -782,7 +783,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
782783async function parseHttpFile (
783784 fileUrl : string ,
784785 filename : string
785- ) : Promise < { content : string ; metadata ?: any } > {
786+ ) : Promise < { content : string ; metadata ?: FileParseMetadata } > {
786787 const buffer = await downloadFileWithTimeout ( fileUrl )
787788
788789 const extension = filename . split ( '.' ) . pop ( ) ?. toLowerCase ( )
0 commit comments