simstudioai
diff --git a/apps/sim/app/api/knowledge/utils.test.ts
Lines changed: 10 additions & 0 deletions b/apps/sim/app/api/knowledge/utils.test.ts
Lines changed: 10 additions & 0 deletions
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx
Lines changed: 9 additions & 35 deletions b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx
Lines changed: 9 additions & 35 deletions
diff --git a/apps/sim/background/knowledge-processing.ts
Lines changed: 1 addition & 0 deletions b/apps/sim/background/knowledge-processing.ts
Lines changed: 1 addition & 0 deletions
diff --git a/apps/sim/hooks/queries/knowledge.ts
Lines changed: 2 additions & 0 deletions b/apps/sim/hooks/queries/knowledge.ts
Lines changed: 2 additions & 0 deletions
diff --git a/apps/sim/hooks/use-knowledge.ts
Lines changed: 2 additions & 0 deletions b/apps/sim/hooks/use-knowledge.ts
Lines changed: 2 additions & 0 deletions
diff --git a/apps/sim/lib/chunkers/text-chunker.ts
Lines changed: 4 additions & 2 deletions b/apps/sim/lib/chunkers/text-chunker.ts
Lines changed: 4 additions & 2 deletions
diff --git a/apps/sim/lib/core/config/env.ts
Lines changed: 1 addition & 0 deletions b/apps/sim/lib/core/config/env.ts
Lines changed: 1 addition & 0 deletions
diff --git a/apps/sim/lib/file-parsers/doc-parser.ts
Lines changed: 57 additions & 32 deletions b/apps/sim/lib/file-parsers/doc-parser.ts
Lines changed: 57 additions & 32 deletions
diff --git a/apps/sim/lib/file-parsers/docx-parser.ts
Lines changed: 62 additions & 15 deletions b/apps/sim/lib/file-parsers/docx-parser.ts
Lines changed: 62 additions & 15 deletions
@@ -136,6 +136,16 @@ vi.mock('@sim/db', () => {
           },
         }),
       }),
+      delete: () => ({
+        where: () => Promise.resolve(),
+      }),
+      insert: (table: any) => ({
+        values: (records: any) => {
+          dbOps.order.push('insert')
+          dbOps.insertRecords.push(records)
+          return Promise.resolve()
+        },
+      }),
       transaction: vi.fn(async (fn: any) => {
         await fn({
           insert: (table: any) => ({
 
@@ -453,6 +453,8 @@ export function KnowledgeBase({
     error: knowledgeBaseError,
     refresh: refreshKnowledgeBase,
   } = useKnowledgeBase(id)
+  const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false)
+
   const {
     documents,
     pagination,
@@ -468,6 +470,7 @@ export function KnowledgeBase({
     offset: (currentPage - 1) * DOCUMENTS_PER_PAGE,
     sortBy,
     sortOrder,
+    refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false,
   })
 
   const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id)
@@ -534,25 +537,15 @@ export function KnowledgeBase({
   )
 
   useEffect(() => {
-    const hasProcessingDocuments = documents.some(
+    const processing = documents.some(
       (doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing'
     )
+    setHasProcessingDocuments(processing)
 
-    if (!hasProcessingDocuments) return
-
-    const refreshInterval = setInterval(async () => {
-      try {
-        if (!isDeleting) {
-          await checkForDeadProcesses()
-          await refreshDocuments()
-        }
-      } catch (error) {
-        logger.error('Error refreshing documents:', error)
-      }
-    }, 3000)
-
-    return () => clearInterval(refreshInterval)
-  }, [documents, refreshDocuments, isDeleting])
+    if (processing) {
+      checkForDeadProcesses()
+    }
+  }, [documents])
 
   /**
    * Checks for documents with stale processing states and marks them as failed
@@ -672,25 +665,6 @@ export function KnowledgeBase({
 
       await refreshDocuments()
 
-      let refreshAttempts = 0
-      const maxRefreshAttempts = 3
-      const refreshInterval = setInterval(async () => {
-        try {
-          refreshAttempts++
-          await refreshDocuments()
-          if (refreshAttempts >= maxRefreshAttempts) {
-            clearInterval(refreshInterval)
-          }
-        } catch (error) {
-          logger.error('Error refreshing documents after retry:', error)
-          clearInterval(refreshInterval)
-        }
-      }, 1000)
-
-      setTimeout(() => {
-        clearInterval(refreshInterval)
-      }, 4000)
-
       logger.info(`Document retry initiated successfully for: ${docId}`)
     } catch (err) {
       logger.error('Error retrying document:', err)
 
@@ -27,6 +27,7 @@ export type DocumentProcessingPayload = {
 export const processDocument = task({
   id: 'knowledge-process-document',
   maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
+  machine: 'large-1x', // 4 vCPU, 8 GB RAM (Trigger.dev large-1x preset) - needed for large PDF processing
   retry: {
     maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
     factor: env.KB_CONFIG_RETRY_FACTOR || 2,
 
@@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery(
   params: KnowledgeDocumentsParams,
   options?: {
     enabled?: boolean
+    refetchInterval?: number | false
   }
 ) {
   const paramsKey = serializeDocumentParams(params)
@@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery(
     enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId),
     staleTime: 60 * 1000,
     placeholderData: keepPreviousData,
+    refetchInterval: options?.refetchInterval ?? false,
   })
 }
 
 
@@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments(
     sortBy?: string
     sortOrder?: string
     enabled?: boolean
+    refetchInterval?: number | false
   }
 ) {
   const queryClient = useQueryClient()
@@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments(
     },
     {
       enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId),
+      refetchInterval: options?.refetchInterval,
     }
   )
 
 
@@ -110,10 +110,12 @@ export class TextChunker {
           chunks.push(currentChunk.trim())
         }
 
-        // Start new chunk with current part
         // If part itself is too large, split it further
         if (this.estimateTokens(part) > this.chunkSize) {
-          chunks.push(...(await this.splitRecursively(part, separatorIndex + 1)))
+          const subChunks = await this.splitRecursively(part, separatorIndex + 1)
+          for (const subChunk of subChunks) {
+            chunks.push(subChunk)
+          }
           currentChunk = ''
         } else {
           currentChunk = part
 
@@ -178,6 +178,7 @@ export const env = createEnv({
     KB_CONFIG_BATCH_SIZE:                  z.number().optional().default(2000),    // Chunks to process per embedding batch
     KB_CONFIG_DELAY_BETWEEN_BATCHES:       z.number().optional().default(0),       // Delay between batches in ms (0 for max speed)
     KB_CONFIG_DELAY_BETWEEN_DOCUMENTS:     z.number().optional().default(50),      // Delay between documents in ms
+    KB_CONFIG_CHUNK_CONCURRENCY:           z.number().optional().default(10),      // Concurrent PDF chunk OCR processing
 
     // Real-time Communication
     SOCKET_SERVER_URL:                     z.string().url().optional(),            // WebSocket server URL for real-time features
 
@@ -17,8 +17,6 @@ export class DocParser implements FileParser {
         throw new Error(`File not found: ${filePath}`)
       }
 
-      logger.info(`Parsing DOC file: ${filePath}`)
-
       const buffer = await readFile(filePath)
       return this.parseBuffer(buffer)
     } catch (error) {
@@ -29,53 +27,80 @@ export class DocParser implements FileParser {
 
   async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
     try {
-      logger.info('Parsing DOC buffer, size:', buffer.length)
-
       if (!buffer || buffer.length === 0) {
         throw new Error('Empty buffer provided')
       }
 
-      let parseOfficeAsync
       try {
         const officeParser = await import('officeparser')
-        parseOfficeAsync = officeParser.parseOfficeAsync
-      } catch (importError) {
-        logger.warn('officeparser not available, using fallback extraction')
-        return this.fallbackExtraction(buffer)
+        const result = await officeParser.parseOfficeAsync(buffer)
+
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                characterCount: content.length,
+                extractionMethod: 'officeparser',
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed, trying mammoth:', officeError)
       }
 
       try {
-        const result = await parseOfficeAsync(buffer)
-
-        if (!result) {
-          throw new Error('officeparser returned no result')
+        const mammoth = await import('mammoth')
+        const result = await mammoth.extractRawText({ buffer })
+
+        if (result.value && result.value.trim().length > 0) {
+          const content = sanitizeTextForUTF8(result.value.trim())
+          return {
+            content,
+            metadata: {
+              characterCount: content.length,
+              extractionMethod: 'mammoth',
+              messages: result.messages,
+            },
+          }
         }
-
-        const resultString = typeof result === 'string' ? result : String(result)
-
-        const content = sanitizeTextForUTF8(resultString.trim())
-
-        logger.info('DOC parsing completed successfully with officeparser')
-
-        return {
-          content: content,
-          metadata: {
-            characterCount: content.length,
-            extractionMethod: 'officeparser',
-          },
-        }
-      } catch (extractError) {
-        logger.warn('officeparser failed, using fallback:', extractError)
-        return this.fallbackExtraction(buffer)
+      } catch (mammothError) {
+        logger.warn('mammoth failed:', mammothError)
       }
+
+      return this.fallbackExtraction(buffer)
     } catch (error) {
-      logger.error('DOC buffer parsing error:', error)
+      logger.error('DOC parsing error:', error)
       throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`)
     }
   }
 
   private fallbackExtraction(buffer: Buffer): FileParseResult {
-    logger.info('Using fallback text extraction for DOC file')
+    const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf
+
+    if (!isBinaryDoc) {
+      const textContent = buffer.toString('utf8').trim()
+
+      if (textContent.length > 0) {
+        const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0
+        const isProbablyText = printableChars / textContent.length > 0.9
+
+        if (isProbablyText) {
+          return {
+            content: sanitizeTextForUTF8(textContent),
+            metadata: {
+              extractionMethod: 'plaintext-fallback',
+              characterCount: textContent.length,
+              warning: 'File is not a valid DOC format, extracted as plain text',
+            },
+          }
+        }
+      }
+    }
 
     const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))
 
 
@@ -2,10 +2,10 @@ import { readFile } from 'fs/promises'
 import { createLogger } from '@sim/logger'
 import mammoth from 'mammoth'
 import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
+import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils'
 
 const logger = createLogger('DocxParser')
 
-// Define interface for mammoth result
 interface MammothResult {
   value: string
   messages: any[]
@@ -19,7 +19,6 @@ export class DocxParser implements FileParser {
       }
 
       const buffer = await readFile(filePath)
-
       return this.parseBuffer(buffer)
     } catch (error) {
       logger.error('DOCX file error:', error)
@@ -29,26 +28,74 @@ export class DocxParser implements FileParser {
 
   async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
     try {
-      logger.info('Parsing buffer, size:', buffer.length)
+      if (!buffer || buffer.length === 0) {
+        throw new Error('Empty buffer provided')
+      }
 
-      const result = await mammoth.extractRawText({ buffer })
+      try {
+        const result = await mammoth.extractRawText({ buffer })
+
+        if (result.value && result.value.trim().length > 0) {
+          let htmlResult: MammothResult = { value: '', messages: [] }
+          try {
+            htmlResult = await mammoth.convertToHtml({ buffer })
+          } catch {
+            // HTML conversion is optional
+          }
+
+          return {
+            content: sanitizeTextForUTF8(result.value),
+            metadata: {
+              extractionMethod: 'mammoth',
+              messages: [...result.messages, ...htmlResult.messages],
+              html: htmlResult.value,
+            },
+          }
+        }
+      } catch (mammothError) {
+        logger.warn('mammoth failed, trying officeparser:', mammothError)
+      }
 
-      let htmlResult: MammothResult = { value: '', messages: [] }
       try {
-        htmlResult = await mammoth.convertToHtml({ buffer })
-      } catch (htmlError) {
-        logger.warn('HTML conversion warning:', htmlError)
+        const officeParser = await import('officeparser')
+        const result = await officeParser.parseOfficeAsync(buffer)
+
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                extractionMethod: 'officeparser',
+                characterCount: content.length,
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed:', officeError)
       }
 
-      return {
-        content: result.value,
-        metadata: {
-          messages: [...result.messages, ...htmlResult.messages],
-          html: htmlResult.value,
-        },
+      const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b
+      if (!isZipFile) {
+        const textContent = buffer.toString('utf8').trim()
+        if (textContent.length > 0) {
+          return {
+            content: sanitizeTextForUTF8(textContent),
+            metadata: {
+              extractionMethod: 'plaintext-fallback',
+              characterCount: textContent.length,
+              warning: 'File is not a valid DOCX format, extracted as plain text',
+            },
+          }
+        }
       }
+
+      throw new Error('Failed to extract text from DOCX file')
     } catch (error) {
-      logger.error('DOCX buffer parsing error:', error)
+      logger.error('DOCX parsing error:', error)
       throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`)
     }
   }
Original file line number	Diff line number	Diff line change
`@@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery(`
`228`	`228`	`params: KnowledgeDocumentsParams,`
`229`	`229`	`options?: {`
`230`	`230`	`enabled?: boolean`
	`231`	`+ refetchInterval?: number \| false`
`231`	`232`	`}`
`232`	`233`	`) {`
`233`	`234`	`const paramsKey = serializeDocumentParams(params)`
`@@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery(`
`237`	`238`	`enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId),`
`238`	`239`	`staleTime: 60 * 1000,`
`239`	`240`	`placeholderData: keepPreviousData,`
	`241`	`+ refetchInterval: options?.refetchInterval ?? false,`
`240`	`242`	`})`
`241`	`243`	`}`
`242`	`244`
Original file line number	Diff line number	Diff line change
`@@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments(`
`67`	`67`	`sortBy?: string`
`68`	`68`	`sortOrder?: string`
`69`	`69`	`enabled?: boolean`
	`70`	`+ refetchInterval?: number \| false`
`70`	`71`	`}`
`71`	`72`	`) {`
`72`	`73`	`const queryClient = useQueryClient()`
`@@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments(`
`92`	`93`	`},`
`93`	`94`	`{`
`94`	`95`	`enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId),`
	`96`	`+ refetchInterval: options?.refetchInterval,`
`95`	`97`	`}`
`96`	`98`	`)`
`97`	`99`